My project contains many fill, copy, and other basic operations. However, I'm new to CUDA programming, and my current implementation just uses a for loop to operate on device_vector, which is far less efficient than using iterators.
My question is: how can I use iterators (e.g., counting_iterator, permutation_iterator, etc.) to implement the functions below?
fill, sequence, and transform values from a specified index, in batch.
A toy example:
len1 = 3, len2 = 5;
vecA.size() = (len1 + 2) * len2;
vecB.size() = len2;
// {} are just to help show the difference
// fill vecA in batches of len1, from index 1 in each batch, using each value of vecB; vecA` is the original vector
vecB = [ 1, 5, 2, 4, 2 ]
vecA`= [1, {1, 1, 1}, 1, 1, {1, 1, 1}, 1, 1, {1, 1, 1}, 1, 1, {1, 1, 1}, 1, 1, {1, 1, 1}, 1]
vecA = [1, {1, 1, 1}, 1, 1, {5, 5, 5}, 1, 1, {2, 2, 2}, 1, 1, {4, 4, 4}, 1, 1, {2, 2, 2}, 1]
// multiply values in vecC by 2, in batches of len1 from index 1; vecC` is the original vector
vecC.size() = (len1 + 2) * len2;
vecC`= [1, {1, 1, 1}, 1, 1, {2, 2, 2}, 1, 1, {3, 3, 3}, 1, 1, {4, 4, 4}, 1, 1, {5, 5, 5}, 1]
vecC = [1, {2, 2, 2}, 1, 1, {4, 4, 4}, 1, 1, {6, 6, 6}, 1, 1, {8, 8, 8}, 1, 1, {10, 10, 10}, 1]
// sequence vecD(len1 * len2) in batches of len1
vecD = [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]
The following code uses a for loop; I guess it's far less efficient than it could be:
size_t len1 = 3, len2 = 5;
thrust::device_vector<int> vecA((len1 + 2) * len2);
thrust::device_vector<int> vecC((len1 + 2) * len2);
thrust::device_vector<int> vecB(len2);
size_t offset1 = 0;
size_t offset2 = 0;
for (size_t i = 0; i < len2; i++)
{
    offset1 = i * (len1 + 2) + 1; // +1 means from specified index 1 in each batch
    thrust::fill_n(vecA.begin() + offset1, len1, vecB.begin()[i]);
    thrust::transform(vecC.begin() + offset1, vecC.begin() + offset1 + len1,
                      vecC.begin() + offset1, scalar_mult_functor<int>(2));
}
// sequence
thrust::device_vector<int> vecD(len1 * len2);
for (size_t i = 0; i < len2; i++)
{
    offset1 = i * len1;
    offset2 = (i + 1) * len1;
    thrust::sequence(vecD.begin() + offset1, vecD.begin() + offset2);
}
copy a sub-vector to another vector, in batch.
A toy example:
len1 = 2, len2 = 4, len3 = 5;
// copy values of vecA(len1 * len3) in batch (len1) to vecB(len2 * len3)
vecA = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ]
vecB = [1, 2, 1, 2, 3, 4, 3, 4, 5, 6, 5, 6, 7, 8, 7, 8, 9, 10, 9, 10]
To implement this, I simply use two for loops to copy values, but obviously that is inefficient.
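For reference, a minimal sketch of what that nested loop might look like (hypothetical, and assuming len2 is a multiple of len1):
// hypothetical sketch of the nested-loop copy
size_t repeats = len2 / len1; // how often each len1-batch is repeated
for (size_t i = 0; i < len3; i++) {
    for (size_t r = 0; r < repeats; r++) {
        thrust::copy_n(vecA.begin() + i * len1, len1,
                       vecB.begin() + i * len2 + r * len1);
    }
}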
reduce_sum the values of a vector in batches of len1 (not one value at a time).
A toy example:
len1 = 4, len2 = 3;
vecA.size() = len1 * len2;
vecB.size() = len1;
vecA = [1, 2, 3, 4, 1, 1, 1, 1, 2, 2, 2, 2]
vecB = {1, 2, 3, 4} + {1, 1, 1, 1} + {2, 2, 2, 2} = [4, 5, 6, 7]
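A simple-loop version of this batched sum might look like the following sketch (an assumption on my part, not necessarily the exact code I used); it accumulates each len1-slice of vecA into vecB with thrust::transform:
// hypothetical sketch: accumulate each len1-slice of vecA into vecB
thrust::device_vector<int> vecB(len1); // device_vector zero-initializes
for (size_t i = 0; i < len2; i++) {
    thrust::transform(vecB.begin(), vecB.end(),
                      vecA.begin() + i * len1,
                      vecB.begin(),
                      thrust::plus<int>());
}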
The above operations are more like those performed on 2D vectors using STL vectors on the CPU. I checked some tutorials and tried to implement them with iterators, but got wrong results.
But I have another question: doing the reduce sum per batch into vecB (see the detailed example below) is like adding a new for loop to this operation (a double for loop). Which method is better? How should we modify the iterator code?
Modified from the previous toy example:
len1 = 4, len2 = 3; len3 = 2;
vecA.size() = len1 * len2 * len3;
vecB.size() = len1 * len3;
vecA = [1, 2, 3, 4, 1, 1, 1, 1, 2, 2, 2, 2, 3, 2, 1, 0, 1, 1, 1, 1, 2, 2, 2, 2]
vecB = [{1, 2, 3, 4} + {1, 1, 1, 1} + {2, 2, 2, 2}, {3, 2, 1, 0} + {1, 1, 1, 1} + {2, 2, 2, 2}]
= [4, 5, 6, 7, 6, 5, 4, 3]
Different from the previous example, in this case we do two reduce sums into vecB, batched over len3 = 2.
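For illustration, the double for loop I mean would look roughly like this (hypothetical sketch; vecB is assumed zero-initialized):
// hypothetical sketch: outer loop over len3, inner loop accumulates len2 slices
for (size_t j = 0; j < len3; j++) {
    for (size_t i = 0; i < len2; i++) {
        thrust::transform(vecB.begin() + j * len1,
                          vecB.begin() + (j + 1) * len1,
                          vecA.begin() + (j * len2 + i) * len1,
                          vecB.begin() + j * len1,
                          thrust::plus<int>());
    }
}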
1st Update
Updated Comparison between Simple for loop and Thrust iterators
I compared the performance gap between my simple for loop and the method (below) provided by @paleonix. This comparison may help beginners understand the difference, or make a choice when faced with the same needs.
This test was conducted on a server with Tesla V100 GPUs (we aim to explore the gap, so the exact machine is not important).
Note:
- I did not test all size combinations (len1, len2, etc.) (too many combinations...). I will do it later if I have time.
- For the fancy iterators and for_each_n, a larger data size may be needed to compare them.
From this comparison, we can see:
- The "simple for loop" method is unsurprisingly slower than Thrust iterators. In particular, its performance drops significantly when the data size becomes larger.
- For Reduce, for_each_n is slower than reduce_by_key_batch.
2nd Update
Updated Comparison between reduce using different iterators when len1 is large while len2 is small
Recall that @paleonix mentioned the result would be different in this case:
For the special case that len1 is very big (enough parallelism for a modern GPU) which probably means that len2 is rather small, one could instead go with another for_each based, kernel-like solution that just sums up values in parallel and which does coalesced accesses:
I did some experiments and the result of the rough comparison is shown in the table:
The results confirm @paleonix's comment.
3rd Update
Updated Comparison between reduce sum using iterators and for_each, when len1 * len3 is large
In this case, the reduce sum is performed in batches (i.e., len3) into vecB.
All statements about performance in this answer assume data sizes big enough to fully make use of a modern GPU, and Thrust using the default CUDA device backend/system (vs. OMP or TBB).
For your examples where the batches are padded by 1s, one could just use fancy iterators with thrust::transform, but this is probably a bad idea due to alignment issues and code complexity. Instead one can use thrust::transform_if, where the "stencil" is given by a simple counting iterator and all the logic is put into the predicate functor. To get a batched fill, one then still needs another, more complicated fancy iterator to read from the input vector (vecB).
const auto batched_indices_it =
thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[len1_padded]
__host__ __device__ (int idx) {
return (idx - 1) / len1_padded; // corresponds to i
});
const auto batched_fill_it =
thrust::make_permutation_iterator(
vecB.cbegin(),
batched_indices_it);
const auto is_valid =
[len1, len1_padded]
__host__ __device__ (int idx) {
return (idx != 0) && ((idx - 1) % len1_padded < len1);
};
// copy_if doesn't work here as it "compactifies" the output
// so use transform_if with identity operation
thrust::transform_if(
batched_fill_it, batched_fill_it + vecA.size(),
thrust::make_counting_iterator(0),
vecA.begin(),
thrust::identity<int>{},
is_valid);
thrust::transform_if(
vecC.cbegin(), vecC.cend(),
thrust::make_counting_iterator(0),
vecC.begin(),
scalar_mult_functor<int>{2},
is_valid);
Alternatively, one might just use a thrust::for_each without any complicated fancy iterators, which might be easier to read/understand and should be comparable or identical in performance for these trivially parallel operations (due to the kernel fusion achieved in the sample code below, it could even be faster than the two transforms):
const auto A_ptr = vecA.data();
const auto B_ptr = vecB.data();
const auto C_ptr = vecC.data();
const auto op = scalar_mult_functor<int>(2);
thrust::for_each_n(
thrust::make_counting_iterator(0), vecA.size(),
[len1, len1_padded, A_ptr, B_ptr, C_ptr, op]
__host__ __device__ (int idx) {
if ((idx != 0) && ((idx - 1) % len1_padded < len1)) {
A_ptr[idx] = B_ptr[(idx - 1) / len1_padded];
C_ptr[idx] = op(C_ptr[idx]);
}
});
The batched sequence is relatively straightforward to implement as a fancy iterator, which need not even be copied to memory; instead it can just be used inside the next operation needing it as input (kernel fusion):
const auto batched_sequence_it =
thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[len1] __device__ (const int idx) { return idx % len1; });
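If the values are nevertheless needed in memory, a plain copy materializes them into a vector (vecD in the full code below); otherwise the iterator can be passed directly to the consuming algorithm:
// materialize only if needed; otherwise pass batched_sequence_it
// directly to the next algorithm (kernel fusion)
thrust::copy_n(batched_sequence_it, len1 * len2, vecD.begin());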
The batched repetition is a bit more complicated but has the same basic form as the fancy iterator used for the batched fill:
const auto batched_transpose_idx_it =
thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[len1, len2] __host__ __device__ (const int idx) {
const int batch_id = idx / len2;
const int batch_el = idx % len1;
return batch_id * len1 + batch_el;
});
const auto batched_A_transpose_it =
thrust::make_permutation_iterator(
vecA.cbegin(),
batched_transpose_idx_it);
It is also ready to just be lazily evaluated inside some upcoming operation.
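If it is written to memory after all, constructing the vector directly from the iterator (as in the full code below) avoids an unnecessary zero-initialization of vecB:
thrust::device_vector<int> vecB(
    batched_A_transpose_it,
    batched_A_transpose_it + len2 * len3);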
This one is the hardest, at least in terms of getting good performance. The following implementation using thrust::reduce_by_key is generally applicable but probably not very fast, due to the permutation iterator messing up coalescing, which is very important for performance in bandwidth-bound kernels.
const auto transpose_idx_it =
thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[len1, len2] __host__ __device__ (int idx) { // "transpose"
const int batch_id = idx / len2;
const int batch_el = idx % len2;
return batch_el * len1 + batch_id;
});
const auto A_transpose_it =
thrust::make_permutation_iterator(
vecA.cbegin(),
transpose_idx_it);
thrust::reduce_by_key(
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(len1 * len2),
A_transpose_it,
thrust::make_discard_iterator(),
vecB.begin(),
[len2] __host__ __device__ (int idx, int idy) {
return idx / len2 == idy / len2;
});
The A_transpose_it iterator permutes the elements of vecA such that elements that should be summed appear next to each other to the reduction algorithm using it. Therefore the predicate functor/lambda comparing the "keys" (counting iterator) looks as if that were the case.
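For the toy data from the question (len1 = 4, len2 = 3), the permuted view and the keys line up as follows:
// vecA            = [1, 2, 3, 4,  1, 1, 1, 1,  2, 2, 2, 2]
// A_transpose_it  -> A[0], A[4], A[8],  A[1], A[5], A[9],
//                    A[2], A[6], A[10], A[3], A[7], A[11]
//                 =  {1, 1, 2}, {2, 1, 2}, {3, 1, 2}, {4, 1, 2}
// keys (idx/len2) =  {0, 0, 0}, {1, 1, 1}, {2, 2, 2}, {3, 3, 3}
// segment sums    -> vecB = [4, 5, 6, 7]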
I don't think that a general, performant reduction for this problem is achievable with the current algorithms in Thrust. You might have more luck with libraries that are actually written to accommodate multi-dimensional data like MatX (C++ API) or cuTENSOR (C API).
For the special case that len1 is very big (enough parallelism for a modern GPU), which probably means that len2 is rather small, one could instead go with another for_each-based, kernel-like solution that just sums up values in parallel and does coalesced accesses:
const auto A_ptr = vecA.data();
const auto B_ptr = vecB.data();
thrust::for_each_n(
thrust::make_counting_iterator(0), len1,
[len1, len2, A_ptr, B_ptr]
__host__ __device__ (int idx) {
int sum = 0;
// "grid-stride loop"
for (int i = idx; i < len1 * len2; i += len1) {
sum += A_ptr[i];
}
B_ptr[idx] = sum;
});
Seeing OP's benchmark results, which are even clearer than I expected (10,000 is not particularly big in the GPU world), I would use the same coalescing for_each strategy as for the previous reduction. In this case the assumption is that len1 * len3 is very big, but ideally len1 is also a multiple of 32 for coalescing. Even if that is not the case, it might still be much better than a segmented reduction (reduce_by_key) due to general complexity/overhead.
const auto A_ptr = vecA.data();
const auto B_ptr = vecB.data();
thrust::for_each_n(
thrust::make_counting_iterator(0), len1 * len3,
[len1, len2, A_ptr, B_ptr]
__host__ __device__ (int idx) {
const int outer_batch_id = idx / len1;
const int batch_el = idx % len1;
const int begin = outer_batch_id * len1 * len2 + batch_el;
const int end = begin + len1 * len2;
int sum = 0;
for (int i = begin; i < end; i += len1) {
sum += A_ptr[i];
}
B_ptr[idx] = sum;
});
For completeness, I have also added a general version using reduce_by_key in the full code below.
Compiled with nvcc -extended-lambda -std=c++17 (with CUDA 12 I needed -rdc=true for some reason).
#include <cassert>
#include <iostream>
#include <iterator>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/functional.h>
#include <thrust/copy.h>
#include <thrust/transform.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <thrust/reduce.h>
#include <thrust/for_each.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/discard_iterator.h>
template <typename T>
class scalar_mult_functor {
T scalar_;
public:
scalar_mult_functor(T scalar) : scalar_{scalar} {}
__host__ __device__ T operator()(T in) const {
return scalar_ * in;
}
};
void part1() {
const int len1 = 3, len2 = 5;
const int len1_padded = len1 + 2;
thrust::device_vector<int> vecA(len1_padded * len2, 1);
thrust::device_vector<int> vecB(
thrust::make_counting_iterator(2),
thrust::make_counting_iterator(len2 + 2));
thrust::device_vector<int> vecC(len1_padded * len2, 1);
// not sure if compiler is able to merge "/" and "%" from
// different functors
const auto batched_indices_it =
thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[len1_padded]
__host__ __device__ (int idx) {
return (idx - 1) / len1_padded; // corresponds to i
});
const auto batched_fill_it =
thrust::make_permutation_iterator(
vecB.cbegin(),
batched_indices_it);
const auto is_valid =
[len1, len1_padded]
__host__ __device__ (int idx) {
return (idx != 0) && ((idx - 1) % len1_padded < len1);
};
// copy_if doesn't work here as it "compactifies" the output
// so use transform_if with identity operation
thrust::transform_if(
batched_fill_it, batched_fill_it + vecA.size(),
thrust::make_counting_iterator(0),
vecA.begin(),
thrust::identity<int>{},
is_valid);
thrust::transform_if(
vecC.cbegin(), vecC.cend(),
thrust::make_counting_iterator(0),
vecC.begin(),
scalar_mult_functor<int>{2},
is_valid);
#if 0
// can't use a device_vector directly in device code, so capture raw data() pointers
const auto A_ptr = vecA.data();
const auto B_ptr = vecB.data();
const auto C_ptr = vecC.data();
const auto op = scalar_mult_functor<int>(2);
// probably the easiest to read, no fancy iterators
// fused both fill and transform
thrust::for_each_n(
thrust::make_counting_iterator(0), vecA.size(),
[len1, len1_padded, A_ptr, B_ptr, C_ptr, op]
__host__ __device__ (int idx) {
if ((idx != 0) && ((idx - 1) % len1_padded < len1)) {
A_ptr[idx] = B_ptr[(idx - 1) / len1_padded];
C_ptr[idx] = op(C_ptr[idx]);
}
});
#endif
// sequence
thrust::device_vector<int> vecD(len1 * len2);
auto batched_sequence_it =
thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[len1] __host__ __device__ (const int idx) { return idx % len1; });
// I wouldn't actually do this copy in real code, as you could just use
// batched_sequence_it in following operations instead of writing
// the values to memory (and reading them again later)
thrust::copy_n(batched_sequence_it, vecD.size(), vecD.begin());
// device2host transfer and printing
thrust::host_vector<int> hA(vecA);
thrust::host_vector<int> hC(vecC);
thrust::host_vector<int> hD(vecD);
auto out = std::ostream_iterator<int>(std::cout, ", ");
thrust::copy(hA.cbegin(), hA.cend(), out); std::cout << '\n';
thrust::copy(hC.cbegin(), hC.cend(), out); std::cout << '\n';
thrust::copy(hD.cbegin(), hD.cend(), out); std::cout << '\n';
}
void part2() {
const int len1 = 2, len2 = 4, len3 = 5;
thrust::device_vector<int> vecA(
thrust::make_counting_iterator(1),
thrust::make_counting_iterator(1 + len1 * len3));
const auto batched_transpose_idx_it =
thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[len1, len2] __host__ __device__ (const int idx) {
const int batch_id = idx / len2;
const int batch_el = idx % len1;
return batch_id * len1 + batch_el;
});
const auto batched_A_transpose_it =
thrust::make_permutation_iterator(
vecA.cbegin(),
batched_transpose_idx_it);
// Again, if used only once, one might not want to write this to
// memory at all, but if one does, one can avoid the unnecessary
// initialization of vecB to 0, by passing the fancy iterator to
// the constructor
thrust::device_vector<int> vecB(
batched_A_transpose_it,
batched_A_transpose_it + len2 * len3);
// device2host transfer and printing
thrust::host_vector<int> hB(vecB);
auto out = std::ostream_iterator<int>(std::cout, ", ");
thrust::copy(hB.cbegin(), hB.cend(), out); std::cout << '\n';
}
void part3() {
constexpr int A[] = {1, 2, 3, 4, 1, 1, 1, 1, 2, 2, 2, 2};
thrust::host_vector<int> hA(A, A + sizeof(A) / sizeof(A[0]));
const int len1 = 4, len2 = 3;
assert(hA.size() == len1 * len2);
thrust::device_vector<int> vecA(hA);
thrust::device_vector<int> vecB(len1);
// Due to this permutation iterator accesses to A are not aligned
// which is bad for performance, but there is not much we can do
// about that when len1 isn't big other than using a different
// data layout from the start
const auto transpose_idx_it =
thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[len1, len2] __host__ __device__ (int idx) { // "transpose"
const int batch_id = idx / len2;
const int batch_el = idx % len2;
return batch_el * len1 + batch_id;
});
const auto A_transpose_it =
thrust::make_permutation_iterator(
vecA.cbegin(),
transpose_idx_it);
thrust::reduce_by_key(
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(len1 * len2),
A_transpose_it,
thrust::make_discard_iterator(),
vecB.begin(),
[len2] __host__ __device__ (int idx, int idy) {
return idx / len2 == idy / len2;
});
#if 0
const auto A_ptr = vecA.data();
const auto B_ptr = vecB.data();
// This sequential summation actually coalesces well
// But only efficient for very big len1
thrust::for_each_n(
thrust::make_counting_iterator(0), len1,
[len1, len2, A_ptr, B_ptr]
__host__ __device__ (int idx) {
int sum = 0;
// "grid-stride loop"
for (int i = idx; i < len1 * len2; i += len1) {
sum += A_ptr[i];
}
B_ptr[idx] = sum;
});
#endif
// device2host transfer and printing
thrust::host_vector<int> hB(vecB);
auto out = std::ostream_iterator<int>(std::cout, ", ");
thrust::copy(hB.cbegin(), hB.cend(), out); std::cout << '\n';
}
void part4() {
constexpr int A[] = {1, 2, 3, 4, 1, 1, 1, 1, 2, 2, 2, 2,
3, 2, 1, 0, 1, 1, 1, 1, 2, 2, 2, 2};
thrust::host_vector<int> hA(A, A + sizeof(A) / sizeof(A[0]));
const int len1 = 4, len2 = 3, len3 = 2;
assert(hA.size() == len1 * len2 * len3);
thrust::device_vector<int> vecA(hA);
thrust::device_vector<int> vecB(len1 * len3);
#if 0
// Due to this permutation iterator accesses to A are not aligned
// which is bad for performance, but there is not much we can do
// about that when len1 isn't big other than using a different
// data layout from the start
const auto transpose_idx_it =
thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[len1, len2] __host__ __device__ (int idx) { // "transpose"
const int inner_len = len1 * len2;
const int outer_batch_id = idx / inner_len;
const int inner_idx = idx % inner_len;
const int inner_batch_id = inner_idx / len2;
const int batch_el = inner_idx % len2;
return (outer_batch_id * len2 + batch_el) * len1 + inner_batch_id;
});
const auto A_transpose_it =
thrust::make_permutation_iterator(
vecA.cbegin(),
transpose_idx_it);
thrust::reduce_by_key(
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(len1 * len2 * len3),
A_transpose_it,
thrust::make_discard_iterator(),
vecB.begin(),
[len2] __host__ __device__ (int idx, int idy) {
return idx / len2 == idy / len2;
});
#endif
const auto A_ptr = vecA.data();
const auto B_ptr = vecB.data();
// This sequential summation actually coalesces well, if len1 is a
// multiple of 32 (warp size)
// But only efficient for very big len1 * len3
thrust::for_each_n(
thrust::make_counting_iterator(0), len1 * len3,
[len1, len2, A_ptr, B_ptr]
__host__ __device__ (int idx) {
const int outer_batch_id = idx / len1;
const int batch_el = idx % len1;
const int begin = outer_batch_id * len1 * len2 + batch_el;
const int end = begin + len1 * len2;
int sum = 0;
for (int i = begin; i < end; i += len1) {
sum += A_ptr[i];
}
B_ptr[idx] = sum;
});
// device2host transfer and printing
thrust::host_vector<int> hB(vecB);
auto out = std::ostream_iterator<int>(std::cout, ", ");
thrust::copy(hB.cbegin(), hB.cend(), out); std::cout << '\n';
}
int main() {
part1();
part2();
part3();
part4();
}