I have a CUDA kernel which essentially looks like the following.
__global__ void myOpKernel(double *vals, double *data, int *nums, double *crit, int N, int K) {
int index = blockIdx.x*blockDim.x + threadIdx.x;
if (index >= N) return;
double _crit = crit[index];
for (int i=0; i<nums[index]; i++) {
double _res = vals[index*K + i];
if (data[index*K + i] >= _crit) { _res = 0.0; }
vals[index*K + i] = _res;
}
}
This kernel evaluates vals[N*K]
based on its data[N*K]
compared to crit[N]
, and the comparison is conducted on the first nums[N]
elements of the vals
's segment (width K). If the data
is smaller than crit
, it leaves vals
unchanged.
For example, data under consideration will look like the following
int N = 3;
int K = 5;
vals[ 0] = 1.0; data[ 0] = 5.1; crit[0] = 5.0; nums[0] = 3;
vals[ 1] = 1.0; data[ 1] = 4.9;
vals[ 2] = 1.0; data[ 2] = 3.0;
vals[ 3] = 0.0; data[ 3] = 0.0;
vals[ 4] = 0.0; data[ 4] = 0.0;
//-----------------------
vals[ 5] = 1.0; data[ 5] = 2.9; crit[1] = 3.0; nums[1] = 2;
vals[ 6] = 1.0; data[ 6] = 3.1;
vals[ 7] = 0.0; data[ 7] = 0.0;
vals[ 8] = 0.0; data[ 8] = 0.0;
vals[ 9] = 0.0; data[ 9] = 0.0;
//-----------------------
vals[10] = 1.0; data[10] = 8.1; crit[2] = 9.0; nums[2] = 5;
vals[11] = 1.0; data[11] = 7.8;
vals[12] = 1.0; data[12] = 9.1;
vals[13] = 1.0; data[13] = 200.;
vals[14] = 1.0; data[14] = -1.0;
I noticed that this kind of operation is one of top 3 time-consuming kernels, and am considering Thrust-based acceleration.
What I came up with so far looks like the following. It uses expand
provided on Thrust samples (https://github.com/NVIDIA/thrust/blob/master/examples/expand.cu).
struct myOp : public thrust::unary_function<thrust::tuple<double,double,int,int,int,double>, double> {
// vals data 1/K 1%K nums crit
__host__ __device__ // 0 1 2 3 4 5
double operator() (const thrust::tuple<double,double,int,int,int, double> &t) const {
double res;
if (thrust::get<2>(t) >= thrust::get<4>(t)) {
res = thrust::get<0>(t); // do nothing
}else {
if (thrust::get<3>(t) >= thrust::get<4>(t)) {
res = thrust::get<0>(t); // do nothing
}else {
double tmp = thrust::get<0>(t);
if (thrust::get<1>(t) >= thrust::get<5>(t)) { tmp = 0.0; }
res = tmp;
}
}
return res;
}
};
int main() {
using namespace thrust::placeholders;
thrust::device_vector<double> vals(N*K);
thrust::device_vector<double> data(N*K);
thrust::device_vector<double> crit(N);
thrust::device_vector<int> nums(N);
thrust::device_vector<double> res(N*K);
// ... fill values ...
thrust::device_vector<int> nums_expand(N*K);
thrust::device_vector<double> crit_expand(N*K);
// 'expand()' does something like [1,2,3] -> [1,1,1,2,2,2,3,3,3]
expand(thrust::constant_iterator<int>(K),
thrust::constant_iterator<int>(K)+N,
nums.begin(),
nums_expand.begin());
expand(thrust::constant_iterator<int>(K),
thrust::constant_iterator<int>(K)+N,
crit.begin(),
crit_expand.begin());
thrust::transform(thrust::make_zip_iterator(vals.begin(),
data.begin(),
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K), // index related to N
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1%K), // index related to K
nums_expand.begin(),
crit_expand.begin()),
thrust::make_zip_iterator(vals.end(),
data.end(),
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K) + N*K,
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1%K) + N*K,
nums_expand.end(),
crit_expand.end()),
res.begin(),
myOp());
...
}
When I tried this with arbitrary values in the arrays with sets of [N,K]
= [1000,256]
, [10000,256]
, [50000,256]
, [100000,256]
, already the performance is satisfactory.
But I wonder if there is any further chance of speed-up with my Thrust operations. I am expanding some values to take them into if
statements, but maybe this can be avoided by permutation_iterator
and so on, but I cannot come up with how. Also, I am doing _1/K
, _1%K
stuff to get the global and local index of the elements, which could be somehow avoided with more clever mind.
At least, for the cosmetics point of view, I would love to insert expand(...)
into thrust::transform(...)
directly without having to define another vector such as nums_expand
.
Any suggestions for any chance of improvements are welcome.
Full code used for the comparison
//https://stackoverflow.com/questions/31955505/can-thrust-transform-reduce-work-with-2-arrays%5B/url%5D
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/gather.h>
#include <thrust/copy.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
//#include <thrust/execution_policy.h>
#include <iostream>
#include <iomanip>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <helper_timer.h>
///////// https://github.com/NVIDIA/thrust/blob/master/examples/expand.cu //////////
template <typename InputIterator1,
typename InputIterator2,
typename OutputIterator>
OutputIterator expand(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
OutputIterator output)
{
typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;
difference_type input_size = thrust::distance(first1, last1);
difference_type output_size = thrust::reduce(first1, last1);
// scan the counts to obtain output offsets for each input element
thrust::device_vector<difference_type> output_offsets(input_size, 0);
thrust::exclusive_scan(first1, last1, output_offsets.begin());
// scatter the nonzero counts into their corresponding output positions
thrust::device_vector<difference_type> output_indices(output_size, 0);
thrust::scatter_if
(thrust::counting_iterator<difference_type>(0),
thrust::counting_iterator<difference_type>(input_size),
output_offsets.begin(),
first1,
output_indices.begin());
// compute max-scan over the output indices, filling in the holes
thrust::inclusive_scan
(output_indices.begin(),
output_indices.end(),
output_indices.begin(),
thrust::maximum<difference_type>());
// gather input values according to index array (output = first2[output_indices])
thrust::gather(output_indices.begin(),
output_indices.end(),
first2,
output);
// return output + output_size
thrust::advance(output, output_size);
return output;
}
/////////////////////////////////////////////////////////////////////////////////////
template<typename T>
void print_vector(T& vec) {
for (const auto& elem : vec) {
std::cout << std::setw(5) << elem;
}
std::cout << std::endl;
}
void printSdkTimer(StopWatchInterface **timer, int average) {
float fAvgSeconds =
((float)1.0e-3 * (float)sdkGetTimerValue(timer) / (float)average);
printf(" - Elapsed time: %.5f sec \n", fAvgSeconds);
}
struct myOp : public thrust::unary_function<thrust::tuple<double,double,int,int,int,double>, double> {
// vals data 1/K 1%K nums crit
__host__ __device__ // 0 1 2 3 4 5
double operator() (const thrust::tuple<double,double,int,int,int, double> &t) const {
double res;
if (thrust::get<2>(t) >= thrust::get<4>(t)) {
res = thrust::get<0>(t); // do nothing
}else {
if (thrust::get<3>(t) >= thrust::get<4>(t)) {
res = thrust::get<0>(t); // do nothing
}else {
double tmp = thrust::get<0>(t);
if (thrust::get<1>(t) >= thrust::get<5>(t)) { tmp = 0.0; }
res = tmp;
}
}
return res;
}
};
__global__ void myOpKernel(double *vals, double *data, int *nums, double *crit, int N, int K) {
int index = blockIdx.x*blockDim.x + threadIdx.x;
if (index >= N) return;
double _crit = crit[index];
for (int i=0; i<nums[index]; i++) {
double _res = vals[index*K + i];
if (data[index*K + i] >= _crit) { _res = 0.0; }
vals[index*K + i] = _res;
}
}
int main(int argc, char **argv) {
using namespace thrust::placeholders;
int N = atoi(argv[1]);
int K = atoi(argv[2]);
std::cout << "N " << N << " K " << K << std::endl;
thrust::device_vector<double> vals(N*K);
thrust::device_vector<double> data(N*K);
thrust::device_vector<double> crit(N);
thrust::device_vector<int> nums(N);
thrust::device_vector<double> res(N*K);
for (int i=0; i<N; i++) {
crit[i] = 101.0; // arbitrary
nums[i] = 200; // arbitrary number less than 256
for (int j=0; j<K; j++) {
vals[i*K + j] = (double)(i*K + j); // arbitrary
data[i*K + j] = (double)(i*K + j); // arbitrary
}
}
// to be used for kernel
thrust::device_vector<double> vals2 = vals;
thrust::device_vector<double> data2 = data;
thrust::device_vector<double> crit2 = crit;
thrust::device_vector<int> nums2 = nums;
StopWatchInterface *timer=NULL;
//--- 1) thrust
thrust::device_vector<int> nums_expand(N*K);
thrust::device_vector<double> crit_expand(N*K);
expand(thrust::constant_iterator<int>(K),
thrust::constant_iterator<int>(K)+N,
nums.begin(),
nums_expand.begin());
expand(thrust::constant_iterator<int>(K),
thrust::constant_iterator<int>(K)+N,
crit.begin(),
crit_expand.begin());
sdkCreateTimer(&timer);
sdkStartTimer(&timer);
thrust::transform(thrust::make_zip_iterator(vals.begin(),
data.begin(),
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K), // for N
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1%K), // for K
nums_expand.begin(),
crit_expand.begin()),
thrust::make_zip_iterator(vals.end(),
data.end(),
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K) + N*K,
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1%K) + N*K,
nums_expand.end(),
crit_expand.end()),
res.begin(),
myOp());
sdkStopTimer(&timer);
printSdkTimer(&timer,1);
cudaDeviceSynchronize();
sdkResetTimer(&timer);
sdkStartTimer(&timer);
//--- 2) kernel
double *raw_vals2 = thrust::raw_pointer_cast(vals2.data());
double *raw_data2 = thrust::raw_pointer_cast(data2.data());
double *raw_crit2 = thrust::raw_pointer_cast(crit2.data());
int *raw_nums2 = thrust::raw_pointer_cast(nums2.data());
int Nthreads = 256;
int Nblocks = (N*K - 1) / Nthreads + 1;
myOpKernel<<<Nblocks,Nthreads>>>(raw_vals2, raw_data2, raw_nums2, raw_crit2, N, K);
cudaDeviceSynchronize();
sdkStopTimer(&timer);
printSdkTimer(&timer,1);
sdkDeleteTimer(&timer);
return 0;
}
Below is some modified benchmark code with improved kernels. Compiled with nvcc --extended-lambda -arch=sm_89 -O3 main.cu -o main
Since the timer is not included in your code, I use cudaEvents instead. Data buffers are initialized in host vectors to avoid millions of memcopies. I also noticed that the thrust approach does not produce identical results to your kernel for large N.
I added two kernels. myOpKernel3
simply uses 1 threadblock per index to access the num[index] values.
myOpKernel4
uses 1 thread per output element. this requires a prefix sum of nums, and the computation of index
per thread. I chose to precompute the indices. An alternative approach would be to perform a binary search on the prefix sum within the kernel.
For full segments, i.e. nums[i] = 256, the output is
N 100000 K 256
expandtime 6.28806 ms
thrusttransformtime 1.05165 ms
myOpKerneltime 2.04301 ms
results from thrust and myOpKernel do not match
myOpKerneltime3 0.662016 ms
myOpKernel4_setuptime 0.723872 ms
myOpKerneltime4 0.785408 ms
For nums[i] = 128
N 100000 K 256
expandtime 6.2976 ms
thrusttransformtime 1.05472 ms
myOpKerneltime 1.04054 ms
results from thrust and myOpKernel do not match
myOpKerneltime3 0.337952 ms
myOpKernel4_setuptime 0.369152 ms
myOpKerneltime4 0.386048 ms
nums[i] = 4
N 100000 K 256
expandtime 6.2936 ms
thrusttransformtime 1.05165 ms
myOpKerneltime 0.273536 ms
results from thrust and myOpKernel do not match
myOpKerneltime3 0.119104 ms
myOpKernel4_setuptime 0.293824 ms
myOpKerneltime4 0.066848 ms
I did not test non-uniform segment sizes.
Note that performance costs of temporary memory allocations and thrust calls can be reduced by using custom allocators in conjunction with thrust's thrust::cuda::par_nosync
execution policy.
//https://stackoverflow.com/questions/31955505/can-thrust-transform-reduce-work-with-2-arrays%5B/url%5D
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/gather.h>
#include <thrust/copy.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <iostream>
#include <iomanip>
#include <thrust/transform.h>
#include <thrust/functional.h>
///////// https://github.com/NVIDIA/thrust/blob/master/examples/expand.cu //////////
template <typename InputIterator1,
typename InputIterator2,
typename OutputIterator>
OutputIterator expand(InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
OutputIterator output)
{
typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;
difference_type input_size = thrust::distance(first1, last1);
difference_type output_size = thrust::reduce(first1, last1);
// scan the counts to obtain output offsets for each input element
thrust::device_vector<difference_type> output_offsets(input_size, 0);
thrust::exclusive_scan(first1, last1, output_offsets.begin());
// scatter the nonzero counts into their corresponding output positions
thrust::device_vector<difference_type> output_indices(output_size, 0);
thrust::scatter_if
(thrust::counting_iterator<difference_type>(0),
thrust::counting_iterator<difference_type>(input_size),
output_offsets.begin(),
first1,
output_indices.begin());
// compute max-scan over the output indices, filling in the holes
thrust::inclusive_scan
(output_indices.begin(),
output_indices.end(),
output_indices.begin(),
thrust::maximum<difference_type>());
// gather input values according to index array (output = first2[output_indices])
thrust::gather(output_indices.begin(),
output_indices.end(),
first2,
output);
// return output + output_size
thrust::advance(output, output_size);
return output;
}
/////////////////////////////////////////////////////////////////////////////////////
template<typename T>
void print_vector(T& vec) {
for (const auto& elem : vec) {
std::cout << std::setw(5) << elem;
}
std::cout << std::endl;
}
struct myOp : public thrust::unary_function<thrust::tuple<double,double,int,int,int,double>, double> {
// vals data 1/K 1%K nums crit
__host__ __device__ // 0 1 2 3 4 5
double operator() (const thrust::tuple<double,double,int,int,int, double> &t) const {
double res;
if (thrust::get<2>(t) >= thrust::get<4>(t)) {
res = thrust::get<0>(t); // do nothing
}else {
if (thrust::get<3>(t) >= thrust::get<4>(t)) {
res = thrust::get<0>(t); // do nothing
}else {
double tmp = thrust::get<0>(t);
if (thrust::get<1>(t) >= thrust::get<5>(t)) { tmp = 0.0; }
res = tmp;
}
}
return res;
}
};
__global__ void myOpKernel(double *vals, double *data, int *nums, double *crit, int N, int K) {
int index = blockIdx.x*blockDim.x + threadIdx.x;
if (index >= N) return;
double _crit = crit[index];
for (int i=0; i<nums[index]; i++) {
double _res = vals[index*K + i];
if (data[index*K + i] >= _crit) { _res = 0.0; }
vals[index*K + i] = _res;
}
}
__global__ void myOpKernel3(
double * __restrict__ vals,
const double * __restrict__ data,
const int * __restrict__ nums,
const double * __restrict__ crit,
int N,
int K
){
for(int index = blockIdx.x; index < N; index += gridDim.x){
const double _crit = crit[index];
const int num = nums[index];
for(int i = threadIdx.x; i < num; i += blockDim.x){
double _res = vals[index*K + i];
if (data[index*K + i] >= _crit) { _res = 0.0; }
vals[index*K + i] = _res;
}
}
}
__global__
void myOpKernel4(
double * __restrict__ vals,
const double * __restrict__ data,
const int * __restrict__ nums,
const double * __restrict__ crit,
const int* __restrict__ numsPrefixSum,
const int* __restrict__ indexForThread,
int totalnums,
int N,
int K
){
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int numValid = totalnums;
if(tid < numValid){
const int index = indexForThread[tid];
const int i = tid - numsPrefixSum[index];
const double _crit = crit[index];
double _res = vals[index*K + i];
if (data[index*K + i] >= _crit) { _res = 0.0; }
vals[index*K + i] = _res;
}
}
int main(int argc, char **argv) {
using namespace thrust::placeholders;
int N = atoi(argv[1]);
int K = atoi(argv[2]);
std::cout << "N " << N << " K " << K << std::endl;
thrust::host_vector<double> h_vals(N*K);
thrust::host_vector<double> h_data(N*K);
thrust::host_vector<double> h_crit(N);
thrust::host_vector<int> h_nums(N);
for (int i=0; i<N; i++) {
h_crit[i] = 101.0; // arbitrary
h_nums[i] = 4; // arbitrary number less than 256
for (int j=0; j<K; j++) {
h_vals[i*K + j] = (double)(i*K + j); // arbitrary
h_data[i*K + j] = (double)(i*K + j); // arbitrary
}
}
thrust::device_vector<double> vals = h_vals;
thrust::device_vector<double> data = h_data;
thrust::device_vector<double> crit = h_crit;
thrust::device_vector<int> nums = h_nums;
thrust::device_vector<double> res(vals.size());
cudaEvent_t eventA; cudaEventCreate(&eventA);
cudaEvent_t eventB; cudaEventCreate(&eventB);
//--- 1) thrust
cudaEventRecord(eventA);
thrust::device_vector<int> nums_expand(N*K);
thrust::device_vector<double> crit_expand(N*K);
expand(thrust::constant_iterator<int>(K),
thrust::constant_iterator<int>(K)+N,
nums.begin(),
nums_expand.begin());
expand(thrust::constant_iterator<int>(K),
thrust::constant_iterator<int>(K)+N,
crit.begin(),
crit_expand.begin());
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
float expandtime; cudaEventElapsedTime(&expandtime, eventA, eventB);
std::cout << "expandtime " << expandtime << " ms\n";
cudaEventRecord(eventA);
thrust::transform(thrust::make_zip_iterator(vals.begin(),
data.begin(),
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K), // for N
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1%K), // for K
nums_expand.begin(),
crit_expand.begin()),
thrust::make_zip_iterator(vals.end(),
data.end(),
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1/K) + N*K,
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), _1%K) + N*K,
nums_expand.end(),
crit_expand.end()),
res.begin(),
myOp());
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
float thrusttransformtime; cudaEventElapsedTime(&thrusttransformtime, eventA, eventB);
std::cout << "thrusttransformtime " << thrusttransformtime << " ms\n";
cudaDeviceSynchronize();
// std::cout << "vals after thrust\n";
// for(int i = 0; i < res.size(); i++){
// std::cout << res[i] << " ";
// }
// std::cout << "\n";
//--- 2) kernel
thrust::device_vector<double> vals2 = h_vals;
thrust::device_vector<double> data2 = h_data;
thrust::device_vector<double> crit2 = h_crit;
thrust::device_vector<int> nums2 = h_nums;
cudaEventRecord(eventA);
int Nthreads = 256;
int Nblocks = (N*K - 1) / Nthreads + 1;
myOpKernel<<<Nblocks,Nthreads>>>(vals2.data().get(), data2.data().get(), nums2.data().get(), crit2.data().get(), N, K);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
float myOpKerneltime; cudaEventElapsedTime(&myOpKerneltime, eventA, eventB);
std::cout << "myOpKerneltime " << myOpKerneltime << " ms\n";
cudaDeviceSynchronize();
if(res == vals2){
std::cout << "results from thrust and myOpKernel match\n";
}else{
std::cout << "results from thrust and myOpKernel do not match\n";
}
{
//1 block per index
thrust::device_vector<double> vals_new = h_vals;
thrust::device_vector<double> data_new = h_data;
thrust::device_vector<double> crit_new = h_crit;
thrust::device_vector<int> nums_new = h_nums;
cudaEventRecord(eventA);
int Nthreads = 256;
int Nblocks = N;
myOpKernel3<<<Nblocks,Nthreads>>>(vals_new.data().get(), data_new.data().get(), nums_new.data().get(), crit_new.data().get(), N, K);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
float myOpKerneltime3; cudaEventElapsedTime(&myOpKerneltime3, eventA, eventB);
std::cout << "myOpKerneltime3 " << myOpKerneltime3 << " ms\n";
cudaDeviceSynchronize();
assert(vals_new == vals2);
}
{
//1 thread per output position
thrust::device_vector<double> vals_new = h_vals;
thrust::device_vector<double> data_new = h_data;
thrust::device_vector<double> crit_new = h_crit;
thrust::device_vector<int> nums_new = h_nums;
cudaEventRecord(eventA);
thrust::device_vector<int> numsPrefixSum(N+1);
numsPrefixSum[0] = 0;
thrust::inclusive_scan(
nums_new.begin(),
nums_new.end(),
numsPrefixSum.begin() + 1
);
const int totalNums = numsPrefixSum.back();
thrust::device_vector<int> indexForThread(totalNums, 0);
thrust::scatter_if(
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(0) + N,
numsPrefixSum.begin(),
thrust::make_transform_iterator(
nums_new.begin(),
[] __host__ __device__ (int i){return i > 0;}
),
indexForThread.begin()
);
thrust::inclusive_scan(
indexForThread.begin(),
indexForThread.begin() + totalNums,
indexForThread.begin(),
thrust::maximum<int>{}
);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
float myOpKernel4_setuptime; cudaEventElapsedTime(&myOpKernel4_setuptime, eventA, eventB);
std::cout << "myOpKernel4_setuptime " << myOpKernel4_setuptime << " ms\n";
cudaEventRecord(eventA);
int Nthreads = 256;
int Nblocks = (totalNums + Nthreads - 1) / Nthreads;
myOpKernel4<<<Nblocks,Nthreads>>>(
vals_new.data().get(),
data_new.data().get(),
nums_new.data().get(),
crit_new.data().get(),
numsPrefixSum.data().get(),
indexForThread.data().get(),
totalNums,
N,
K
);
cudaEventRecord(eventB);
cudaEventSynchronize(eventB);
float myOpKerneltime4; cudaEventElapsedTime(&myOpKerneltime4, eventA, eventB);
std::cout << "myOpKerneltime4 " << myOpKerneltime4 << " ms\n";
cudaDeviceSynchronize();
assert(vals_new == vals2);
}
cudaEventDestroy(eventA);
cudaEventDestroy(eventB);
return 0;
}