I have a data structure already running on CUDA and collect the data as below:
struct SearchDataOnDevice
{
size_t npair;
int * id1;
int * id2;
};
I'd like to remove the duplicated id pair w/ and w/o an option called same_id_src
, when same_id_src
is true
, <0, 5>
and <5, 0>
are duplicated and <5, 0>
should be removed. when same_id_src
is false
, both pairs should be kept.
I am new to CUDA and the Thrust library, can anyone help with a quick hint?
Here is one possible approach:
thrust::sort
)thrust::transform
)thrust::copy_if
to perform stream compaction on the sorted pairs to produce the deduplicated resultThe need to handle the cases where e.g. <0 5>
and <5 0>
are considered "identical" or not, is handled via modification to the sort functor, as well as modification to the transform functor. In the sort functor case, we reorder, for comparison purposes, each pair such that the lower ID appears first in the pair. We must arrange the sort functor carefully, so that the case of <0 5>
is chosen preferentially over the case of <5 0>
, when the special condition is true.
Here is an example:
# cat t76.cu
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/transform.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <cstdlib>
struct my_sort_functor
{
bool same_id_src;
my_sort_functor(bool _same_id_src) : same_id_src(_same_id_src) {};
template <typename T1, typename T2>
__host__ __device__
bool operator()(T1 t1, T2 t2){
int t1a = thrust::get<0>(t1);
int t1b = thrust::get<1>(t1);
int t2a = thrust::get<0>(t2);
int t2b = thrust::get<1>(t2);
if (same_id_src) {//need to possibly reorder each pair for testing
bool t1s = (t1a > t1b);
bool t2s = (t2a > t2b);
// sort on smaller id first
if ((t1s?t1b:t1a) < (t2s?t2b:t2a)) return true;
if ((t1s?t1b:t1a) > (t2s?t2b:t2a)) return false;
// then sort on larger id
if ((t1s?t1a:t1b) < (t2s?t2a:t2b)) return true;
if ((t1s?t1a:t1b) > (t2s?t2a:t2b)) return false;
// then sort based on the equality case
// we prefer to choose <0,5> over <5,0>
// so order that one first
return !t1s;}
else { // no reordering of pairs
// sort on first id
if (t1a < t2a) return true;
if (t1a > t2a) return false;
// sort on second id
if (t1b < t2b) return true;
return false;}
}
};
struct my_transform_functor
{
bool same_id_src;
my_transform_functor(bool _same_id_src) : same_id_src(_same_id_src) {};
template <typename T1, typename T2>
__host__ __device__
bool operator()(T1 t1, T2 t2){
if ((thrust::get<0>(t1) == thrust::get<0>(t2)) && (thrust::get<1>(t1) == thrust::get<1>(t2))) return false;
if (same_id_src)
if ((thrust::get<0>(t1) == thrust::get<1>(t2)) && (thrust::get<1>(t1) == thrust::get<0>(t2))) return false;
return true;
}
};
struct my_copy_predicate
{
__host__ __device__
bool operator()(bool t) { return t;}
};
int main(int argc, char *argv[]){
// data setup
int d1[] = {0, 1, 2, 3, 4, 5, 0};
int d2[] = {5, 2, 1, 2, 1, 0, 5};
bool same_id_src = true;
if (argc > 1) same_id_src = false;
size_t npair = sizeof(d1)/sizeof(d1[0]);
int *id1, *id2;
cudaMalloc(&id1, sizeof(d1));
cudaMalloc(&id2, sizeof(d1));
cudaMemcpy(id1, d1, sizeof(d1), cudaMemcpyHostToDevice);
cudaMemcpy(id2, d2, sizeof(d1), cudaMemcpyHostToDevice);
auto dp_id1 = thrust::device_ptr<int>(id1);
auto dp_id2 = thrust::device_ptr<int>(id2);
auto dzip = thrust::make_zip_iterator(thrust::make_tuple(dp_id1, dp_id2));
thrust::device_vector<bool> stencil(npair, true);
thrust::device_vector<int> r1(npair);
thrust::device_vector<int> r2(npair);
auto rzip = thrust::make_zip_iterator(thrust::make_tuple(r1.begin(), r2.begin()));
// step 1: sort
thrust::sort(dzip, dzip+npair, my_sort_functor(same_id_src));
// step 2: mark pairs to be kept in stencil
thrust::transform(dzip, dzip+npair-1, dzip+1, stencil.begin()+1, my_transform_functor(same_id_src));
// step 3: copy if, using stencil
int rsize = thrust::copy_if(dzip, dzip+npair, stencil.begin(), rzip, my_copy_predicate()) - rzip;
// display result
thrust::copy_n(r1.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
thrust::copy_n(r2.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
}
# nvcc -o t76 t76.cu
# compute-sanitizer ./t76
========= COMPUTE-SANITIZER
0 1 4 3
5 2 1 2
========= ERROR SUMMARY: 0 errors
# compute-sanitizer ./t76 1
========= COMPUTE-SANITIZER
0 1 2 3 4 5
5 2 1 2 1 0
========= ERROR SUMMARY: 0 errors
#
When we don't specify a command-line argument, the special case is considered to be true, and additional "duplicates" are removed. When we do specify a command-line argument, the special case is considered to be false.
EDIT: Working off a comment from paleonix below, we can improve the above implementation by replacing steps 2 and 3 with a call to thrust::unique_copy
, which is also a stream compaction operation. The sort process remains unchanged, and only a slight change is made to our previous transform functor, to make it usable for the unique_copy
operation:
# cat t76.cu
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/unique.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <cstdlib>
struct my_sort_functor
{
bool same_id_src;
my_sort_functor(bool _same_id_src) : same_id_src(_same_id_src) {};
template <typename T1, typename T2>
__host__ __device__
bool operator()(T1 t1, T2 t2){
int t1a = thrust::get<0>(t1);
int t1b = thrust::get<1>(t1);
int t2a = thrust::get<0>(t2);
int t2b = thrust::get<1>(t2);
if (same_id_src) {//need to possibly reorder each pair for testing
bool t1s = (t1a > t1b);
bool t2s = (t2a > t2b);
// sort on smaller id first
if ((t1s?t1b:t1a) < (t2s?t2b:t2a)) return true;
if ((t1s?t1b:t1a) > (t2s?t2b:t2a)) return false;
// then sort on larger id
if ((t1s?t1a:t1b) < (t2s?t2a:t2b)) return true;
if ((t1s?t1a:t1b) > (t2s?t2a:t2b)) return false;
// then sort based on the equality case
// we prefer to choose <0,5> over <5,0>
// so order that one first
return !t1s;}
else { // no reordering of pairs
// sort on first id
if (t1a < t2a) return true;
if (t1a > t2a) return false;
// sort on second id
if (t1b < t2b) return true;
return false;}
}
};
struct my_transform_functor
{
bool same_id_src;
my_transform_functor(bool _same_id_src) : same_id_src(_same_id_src) {};
template <typename T1, typename T2>
__host__ __device__
bool operator()(T1 t1, T2 t2){
if ((thrust::get<0>(t1) == thrust::get<0>(t2)) && (thrust::get<1>(t1) == thrust::get<1>(t2))) return true;
if (same_id_src)
if ((thrust::get<0>(t1) == thrust::get<1>(t2)) && (thrust::get<1>(t1) == thrust::get<0>(t2))) return true;
return false;
}
};
int main(int argc, char *argv[]){
// data setup
int d1[] = {0, 1, 2, 3, 4, 5, 0};
int d2[] = {5, 2, 1, 2, 1, 0, 5};
bool same_id_src = true;
if (argc > 1) same_id_src = false;
size_t npair = sizeof(d1)/sizeof(d1[0]);
int *id1, *id2;
cudaMalloc(&id1, sizeof(d1));
cudaMalloc(&id2, sizeof(d1));
cudaMemcpy(id1, d1, sizeof(d1), cudaMemcpyHostToDevice);
cudaMemcpy(id2, d2, sizeof(d1), cudaMemcpyHostToDevice);
auto dp_id1 = thrust::device_ptr<int>(id1);
auto dp_id2 = thrust::device_ptr<int>(id2);
auto dzip = thrust::make_zip_iterator(thrust::make_tuple(dp_id1, dp_id2));
thrust::device_vector<int> r1(npair);
thrust::device_vector<int> r2(npair);
auto rzip = thrust::make_zip_iterator(thrust::make_tuple(r1.begin(), r2.begin()));
// step 1: sort
thrust::sort(dzip, dzip+npair, my_sort_functor(same_id_src));
// step 2: copy unique elements
int rsize = thrust::unique_copy(dzip, dzip+npair, rzip, my_transform_functor(same_id_src)) - rzip;
// display result
thrust::copy_n(r1.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
thrust::copy_n(r2.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
}
# nvcc -o t76 t76.cu
# ./t76
0 1 4 3
5 2 1 2
# ./t76 1
0 1 2 3 4 5
5 2 1 2 1 0
#