I have two operations for manipulating elements in device vectors using CUDA Thrust. How can these two operations be implemented efficiently?
1. Replace part of the values of a vector in batches with values from another vector. An example is shown below:
arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
arr2 = [1, 1, 1, 2, 2, 2]
// After replacing {4, 5, 6} and {10, 11, 12} in batch = 3:
arr1 = [1, 2, 3, 1, 1, 1, 7, 8, 9, 2, 2, 2]
In my case, I always have size(arr1) / size(arr2) = 2. We replace the values in arr1 starting at the indices 1 * batch and 3 * batch. In most cases I need to replace the items in the odd batches, but the general case of arbitrary batch indices is also needed.
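To make the intended semantics concrete, a plain serial CPU version of what I want would look something like this (replace_batches and indices are just illustrative names, not existing code):

#include <vector>

// Serial reference for the first operation: copy arr2 into the batches of
// arr1 whose positions are listed in `indices`, `batch` elements at a time.
void replace_batches(std::vector<int> &arr1, const std::vector<int> &arr2,
                     const std::vector<int> &indices, int batch) {
    for (int b = 0; b < static_cast<int>(indices.size()); ++b) {
        for (int e = 0; e < batch; ++e) {
            arr1[indices[b] * batch + e] = arr2[b * batch + e];
        }
    }
}

With indices = {1, 3} and batch = 3 this turns the first arr1 above into the second one.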
2. Merge two vectors by alternating batches.
A similar question, How to merge 2 vectors alternating indexes?, has been asked before, but it is for the R language.
arr1 = [1, 2, 3, 4, 5, 6]
arr2 = [1, 1, 1, 2, 2, 2]
//After merging arr1 and arr2:
arr3 = [1, 2, 3, 1, 1, 1, 4, 5, 6, 2, 2, 2]
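Again, a serial sketch of what I want (interleave_batches is only an illustrative name):

#include <vector>

// Serial reference for the second operation: interleave arr1 and arr2
// batch-wise, i.e. batch elements from arr1, then batch elements from arr2.
std::vector<int> interleave_batches(const std::vector<int> &arr1,
                                    const std::vector<int> &arr2, int batch) {
    std::vector<int> arr3;
    arr3.reserve(arr1.size() + arr2.size());
    const int n = static_cast<int>(arr1.size());
    for (int i = 0; i < n; i += batch) {
        arr3.insert(arr3.end(), arr1.begin() + i, arr1.begin() + i + batch);
        arr3.insert(arr3.end(), arr2.begin() + i, arr2.begin() + i + batch);
    }
    return arr3;
}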
replace_copy_if may work, but I don't know how to combine it with fancy iterators. Additionally, some blogs show that replace_copy_if is slower than copy_if.
The first operation scatters the values in arr2 into arr1, so let's use thrust::scatter. The indices to which the values are scattered can be computed with a thrust::transform_iterator wrapping a thrust::counting_iterator (comparable to std::ranges::views::iota). For the general case where the batch indices are given as another input thrust::device_vector, you can use
const auto indices_ptr = indices.data();
const auto scatter_indices_it = thrust::make_transform_iterator(
    thrust::make_counting_iterator(0),
    [batch, indices_ptr] __host__ __device__ (int idx) {
        const int batch_id = idx / batch;
        const int elem_id = idx % batch;
        return indices_ptr[batch_id] * batch + elem_id;
    });
In the specific case where you only want to scatter to the odd batches, you can instead use
const auto scatter_indices_it = thrust::make_transform_iterator(
    thrust::make_counting_iterator(0),
    [batch] __host__ __device__ (int idx) {
        const int batch_id = idx / batch;
        const int elem_id = idx % batch;
        return (batch_id * 2 + 1) * batch + elem_id;
    });
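For the example from the question (batch = 3, indices = {1, 3}), both variants produce the scatter indices 3, 4, 5, 9, 10, 11, which are exactly the positions of {4, 5, 6} and {10, 11, 12} in arr1.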
The scatter operation itself is easy:
thrust::scatter(
    arr2.cbegin(), arr2.cend(),
    scatter_indices_it,
    arr1.begin());
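With the example data from the question, arr1 afterwards holds [1, 2, 3, 1, 1, 1, 7, 8, 9, 2, 2, 2].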
There are certainly multiple ways of implementing the second operation. The one that came to mind first for me was merging the two vectors with thrust::merge_by_key, where the keys are generated with a scheme similar to the one used above for the scatter indices:
const auto merge_keys1_it = thrust::make_transform_iterator(
    thrust::make_counting_iterator(0),
    [batch] __host__ __device__ (int idx) {
        const int batch_id = idx / batch;
        return batch_id * 2;
    });
const auto merge_keys2_it = thrust::make_transform_iterator(
    thrust::make_counting_iterator(0),
    [batch] __host__ __device__ (int idx) {
        const int batch_id = idx / batch;
        return batch_id * 2 + 1;
    });
thrust::merge_by_key(
    merge_keys1_it, merge_keys1_it + arr1.size(),
    merge_keys2_it, merge_keys2_it + arr2.size(),
    arr1.cbegin(),
    arr2.cbegin(),
    thrust::make_discard_iterator(),
    arr3.begin());
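For the example input this produces arr3 = [1, 2, 3, 1, 1, 1, 4, 5, 6, 2, 2, 2].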
This works and is relatively elegant, but it is probably not ideal for performance due to the complexity of the merge algorithm and the regularity of the operation. A (probably) more performant approach is to create a fancy iterator that takes care of the whole interleaving operation:
const auto arr1_ptr = arr1.data();
const auto arr2_ptr = arr2.data();
const auto batch_interleave_it = thrust::make_transform_iterator(
    thrust::make_counting_iterator(0),
    [batch, arr1_ptr, arr2_ptr] __host__ __device__ (int idx) -> int {
        const int batch_id = idx / batch;
        const int batch_el = idx % batch;
        const int in_idx = (batch_id / 2) * batch + batch_el;
        const bool even_batch = (batch_id % 2 == 0);
        return even_batch ? arr1_ptr[in_idx] : arr2_ptr[in_idx];
    });
This iterator can now be fed to the next algorithm in your pipeline for kernel fusion, or used to initialize a new vector with the constructor that takes a begin and an end iterator. If the interleaved arr3 will be read multiple times in the future, you should probably put it into a new vector instead of reusing the iterator, as the iterator doesn't allow for coalesced global memory access if batch isn't a multiple of the warp size (32).
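For example, if the next step in your pipeline happened to be a reduction (purely an illustration, the sum is not part of the original problem), the iterator can be consumed directly and the interleaved sequence never has to be written to global memory:

// Kernel fusion: the interleaved values are generated on the fly inside the
// reduction kernel (needs <thrust/reduce.h> and <thrust/execution_policy.h>).
const auto out_size = arr1.size() + arr2.size();
const int sum = thrust::reduce(thrust::device,
                               batch_interleave_it,
                               batch_interleave_it + out_size);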
Due to the use of device lambdas, nvcc needs -extended-lambda. Since CUDA 12 it also needs -rdc=true for some reason.
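So a compile command along these lines should work (file name chosen arbitrarily):

nvcc -extended-lambda -rdc=true interleave.cu -o interleave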
#include <cassert>
#include <iostream>
#include <iterator>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/merge.h>
#include <thrust/scatter.h>

// Copy a device vector to the host and print its elements.
template <typename T>
void print(const thrust::device_vector<T> &vec) {
    thrust::host_vector<T> h_vec(vec);
    std::ostream_iterator<T> out_it(std::cout, ", ");
    thrust::copy(h_vec.cbegin(), h_vec.cend(), out_it);
    std::cout << '\n';
}

// Operation 1: scatter arr2 into selected batches of arr1.
void part1() {
    constexpr int h_arr1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
    constexpr int h_arr2[] = {1, 1, 1, 2, 2, 2};
    constexpr int batch = 3;

    thrust::device_vector<int> arr1(std::cbegin(h_arr1), std::cend(h_arr1));
    thrust::device_vector<int> arr2(std::cbegin(h_arr2), std::cend(h_arr2));
    assert(arr1.size() == 2 * arr2.size());

#if 1
    // specialized version where arr2 is scattered to the "uneven" (odd)
    // batches
    const auto scatter_indices_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch] __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            const int elem_id = idx % batch;
            return (batch_id * 2 + 1) * batch + elem_id;
        });
#else
    // more general version where arr2 is scattered to the batches given
    // in indices
    constexpr int h_indices[] = {1, 3};
    thrust::device_vector<int> indices(
        std::cbegin(h_indices), std::cend(h_indices));
    const auto indices_ptr = indices.data();
    const auto scatter_indices_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch, indices_ptr] __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            const int elem_id = idx % batch;
            return indices_ptr[batch_id] * batch + elem_id;
        });
#endif

    thrust::scatter(
        arr2.cbegin(), arr2.cend(),
        scatter_indices_it,
        arr1.begin());

    print(arr1);
}

// Operation 2: merge arr1 and arr2 by alternating batches.
void part2() {
    constexpr int h_arr1[] = {1, 2, 3, 4, 5, 6};
    constexpr int h_arr2[] = {1, 1, 1, 2, 2, 2};
    constexpr int batch = 3;

    thrust::device_vector<int> arr1(std::cbegin(h_arr1), std::cend(h_arr1));
    thrust::device_vector<int> arr2(std::cbegin(h_arr2), std::cend(h_arr2));
    const auto out_size = arr1.size() + arr2.size();
    assert(arr1.size() == arr2.size());

#if 1
    const auto arr1_ptr = arr1.data();
    const auto arr2_ptr = arr2.data();
    const auto batch_interleave_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch, arr1_ptr, arr2_ptr] __host__ __device__ (int idx) -> int {
            const int batch_id = idx / batch;
            const int batch_el = idx % batch;
            const int in_idx = (batch_id / 2) * batch + batch_el;
            const bool even_batch = (batch_id % 2 == 0);
            return even_batch ? arr1_ptr[in_idx] : arr2_ptr[in_idx];
        });

    // only put into a vector if really needed, better to feed the
    // iterator directly to the next operation for kernel fusion
    thrust::device_vector<int> arr3(
        batch_interleave_it, batch_interleave_it + out_size);
#else
    thrust::device_vector<int> arr3(out_size);

    const auto merge_keys1_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch] __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            return batch_id * 2;
        });
    const auto merge_keys2_it = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [batch] __host__ __device__ (int idx) {
            const int batch_id = idx / batch;
            return batch_id * 2 + 1;
        });

    thrust::merge_by_key(
        merge_keys1_it, merge_keys1_it + arr1.size(),
        merge_keys2_it, merge_keys2_it + arr2.size(),
        arr1.cbegin(),
        arr2.cbegin(),
        thrust::make_discard_iterator(),
        arr3.begin());
#endif

    print(arr3);
}

int main(void) {
    part1();
    part2();
}
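Compiled and run, the program should print (with the trailing separators coming from print):

1, 2, 3, 1, 1, 1, 7, 8, 9, 2, 2, 2,
1, 2, 3, 1, 1, 1, 4, 5, 6, 2, 2, 2,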