duplicatescudagputhrust

Pair deduplication on CUDA


I have a data structure already running on CUDA and collect the data as below:

struct SearchDataOnDevice
{
    size_t npair;
    int * id1;
    int * id2;
};

I'd like to remove the duplicated id pair w/ and w/o an option called same_id_src, when same_id_src is true, <0, 5> and <5, 0> are duplicated and <5, 0> should be removed. when same_id_src is false, both pairs should be kept.

I am new to CUDA and the Thrust library, can anyone help with a quick hint?


Solution

  • Here is one possible approach:

    1. sort the ID pairs, to bring "like" pairs together (thrust::sort)
    2. identify which pairs to keep, based on comparing adjacent pairs. Put results in a stencil array (thrust::transform)
    3. based on the stencil array, use thrust::copy_if to perform stream compaction on the sorted pairs to produce the deduplicated result

    The need to handle the cases where e.g. <0 5> and <5 0> are considered "identical" or not, is handled via modification to the sort functor, as well as modification to the transform functor. In the sort functor case, we reorder, for comparison purposes, each pair such that the lower ID appears first in the pair. We must arrange the sort functor carefully, so that the case of <0 5> is chosen preferentially over the case of <5 0>, when the special condition is true.

    Here is an example:

    # cat t76.cu
    #include <thrust/sort.h>
    #include <thrust/copy.h>
    #include <thrust/transform.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/device_ptr.h>
    #include <thrust/device_vector.h>
    #include <iostream>
    #include <cstdlib>
    
    struct my_sort_functor
    {
      bool same_id_src;
      my_sort_functor(bool _same_id_src) : same_id_src(_same_id_src) {};
      template <typename T1, typename T2>
      __host__ __device__
      bool operator()(T1 t1, T2 t2){
        int t1a = thrust::get<0>(t1);
        int t1b = thrust::get<1>(t1);
        int t2a = thrust::get<0>(t2);
        int t2b = thrust::get<1>(t2);
        if (same_id_src) {//need to possibly reorder each pair for testing
          bool t1s = (t1a > t1b);
          bool t2s = (t2a > t2b);
          // sort on smaller id first
          if ((t1s?t1b:t1a) < (t2s?t2b:t2a)) return true;
          if ((t1s?t1b:t1a) > (t2s?t2b:t2a)) return false;
          // then sort on larger id
          if ((t1s?t1a:t1b) < (t2s?t2a:t2b)) return true;
          if ((t1s?t1a:t1b) > (t2s?t2a:t2b)) return false;
          // then sort based on the equality case
          // we prefer to choose <0,5> over <5,0>
          // so order that one first
          return !t1s;}
        else { // no reordering of pairs
          // sort on first id
          if (t1a < t2a) return true;
          if (t1a > t2a) return false;
          // sort on second id
          if (t1b < t2b) return true;
          return false;}
        }
    };
    
    struct my_transform_functor
    {
      bool same_id_src;
      my_transform_functor(bool _same_id_src) : same_id_src(_same_id_src) {};
      template <typename T1, typename T2>
      __host__ __device__
      bool operator()(T1 t1, T2 t2){
        if ((thrust::get<0>(t1) == thrust::get<0>(t2)) && (thrust::get<1>(t1) == thrust::get<1>(t2))) return false;
        if (same_id_src)
          if ((thrust::get<0>(t1) == thrust::get<1>(t2)) && (thrust::get<1>(t1) == thrust::get<0>(t2))) return false;
        return true;
        }
    };
    
    struct my_copy_predicate
    {
      __host__ __device__
      bool operator()(bool t) { return t;}
    };
    
    int main(int argc, char *argv[]){
    
      // data setup
      int d1[] = {0, 1, 2, 3, 4, 5, 0};
      int d2[] = {5, 2, 1, 2, 1, 0, 5};
      bool same_id_src = true;
      if (argc > 1) same_id_src = false;
      size_t npair = sizeof(d1)/sizeof(d1[0]);
      int *id1, *id2;
      cudaMalloc(&id1, sizeof(d1));
      cudaMalloc(&id2, sizeof(d1));
      cudaMemcpy(id1, d1, sizeof(d1), cudaMemcpyHostToDevice);
      cudaMemcpy(id2, d2, sizeof(d1), cudaMemcpyHostToDevice);
      auto dp_id1 = thrust::device_ptr<int>(id1);
      auto dp_id2 = thrust::device_ptr<int>(id2);
      auto dzip = thrust::make_zip_iterator(thrust::make_tuple(dp_id1, dp_id2));
      thrust::device_vector<bool> stencil(npair, true);
      thrust::device_vector<int> r1(npair);
      thrust::device_vector<int> r2(npair);
      auto rzip = thrust::make_zip_iterator(thrust::make_tuple(r1.begin(), r2.begin()));
      // step 1: sort
      thrust::sort(dzip, dzip+npair, my_sort_functor(same_id_src));
      // step 2: mark pairs to be kept in stencil
      thrust::transform(dzip, dzip+npair-1, dzip+1,  stencil.begin()+1, my_transform_functor(same_id_src));
      // step 3: copy if, using stencil
      int rsize = thrust::copy_if(dzip, dzip+npair, stencil.begin(), rzip, my_copy_predicate()) - rzip;
      // display result
      thrust::copy_n(r1.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
      std::cout << std::endl;
      thrust::copy_n(r2.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
      std::cout << std::endl;
    }
    # nvcc -o t76 t76.cu
    # compute-sanitizer ./t76
    ========= COMPUTE-SANITIZER
    0 1 4 3
    5 2 1 2
    ========= ERROR SUMMARY: 0 errors
    # compute-sanitizer ./t76 1
    ========= COMPUTE-SANITIZER
    0 1 2 3 4 5
    5 2 1 2 1 0
    ========= ERROR SUMMARY: 0 errors
    #
    

    When we don't specify a command-line argument, the special case is considered to be true, and additional "duplicates" are removed. When we do specify a command-line argument, the special case is considered to be false.

    EDIT: Working off a comment from paleonix below, we can improve the above implementation by replacing steps 2 and 3 with a call to thrust::unique_copy, which is also a stream compaction operation. The sort process remains unchanged, and only a slight change is made to our previous transform functor, to make it usable for the unique_copy operation:

    # cat t76.cu
    #include <thrust/sort.h>
    #include <thrust/copy.h>
    #include <thrust/unique.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/device_ptr.h>
    #include <thrust/device_vector.h>
    #include <iostream>
    #include <cstdlib>
    
    struct my_sort_functor
    {
      bool same_id_src;
      my_sort_functor(bool _same_id_src) : same_id_src(_same_id_src) {};
      template <typename T1, typename T2>
      __host__ __device__
      bool operator()(T1 t1, T2 t2){
        int t1a = thrust::get<0>(t1);
        int t1b = thrust::get<1>(t1);
        int t2a = thrust::get<0>(t2);
        int t2b = thrust::get<1>(t2);
        if (same_id_src) {//need to possibly reorder each pair for testing
          bool t1s = (t1a > t1b);
          bool t2s = (t2a > t2b);
          // sort on smaller id first
          if ((t1s?t1b:t1a) < (t2s?t2b:t2a)) return true;
          if ((t1s?t1b:t1a) > (t2s?t2b:t2a)) return false;
          // then sort on larger id
          if ((t1s?t1a:t1b) < (t2s?t2a:t2b)) return true;
          if ((t1s?t1a:t1b) > (t2s?t2a:t2b)) return false;
          // then sort based on the equality case
          // we prefer to choose <0,5> over <5,0>
          // so order that one first
          return !t1s;}
        else { // no reordering of pairs
          // sort on first id
          if (t1a < t2a) return true;
          if (t1a > t2a) return false;
          // sort on second id
          if (t1b < t2b) return true;
          return false;}
        }
    };
    
    struct my_transform_functor
    {
      bool same_id_src;
      my_transform_functor(bool _same_id_src) : same_id_src(_same_id_src) {};
      template <typename T1, typename T2>
      __host__ __device__
      bool operator()(T1 t1, T2 t2){
        if ((thrust::get<0>(t1) == thrust::get<0>(t2)) && (thrust::get<1>(t1) == thrust::get<1>(t2))) return true;
        if (same_id_src)
          if ((thrust::get<0>(t1) == thrust::get<1>(t2)) && (thrust::get<1>(t1) == thrust::get<0>(t2))) return true;
        return false;
        }
    };
    
    int main(int argc, char *argv[]){
    
      // data setup
      int d1[] = {0, 1, 2, 3, 4, 5, 0};
      int d2[] = {5, 2, 1, 2, 1, 0, 5};
      bool same_id_src = true;
      if (argc > 1) same_id_src = false;
      size_t npair = sizeof(d1)/sizeof(d1[0]);
      int *id1, *id2;
      cudaMalloc(&id1, sizeof(d1));
      cudaMalloc(&id2, sizeof(d1));
      cudaMemcpy(id1, d1, sizeof(d1), cudaMemcpyHostToDevice);
      cudaMemcpy(id2, d2, sizeof(d1), cudaMemcpyHostToDevice);
      auto dp_id1 = thrust::device_ptr<int>(id1);
      auto dp_id2 = thrust::device_ptr<int>(id2);
      auto dzip = thrust::make_zip_iterator(thrust::make_tuple(dp_id1, dp_id2));
      thrust::device_vector<int> r1(npair);
      thrust::device_vector<int> r2(npair);
      auto rzip = thrust::make_zip_iterator(thrust::make_tuple(r1.begin(), r2.begin()));
      // step 1: sort
      thrust::sort(dzip, dzip+npair, my_sort_functor(same_id_src));
      // step 2: copy unique elements
      int rsize = thrust::unique_copy(dzip, dzip+npair, rzip, my_transform_functor(same_id_src)) - rzip;
      // display result
      thrust::copy_n(r1.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
      std::cout << std::endl;
      thrust::copy_n(r2.begin(), rsize, std::ostream_iterator<int>(std::cout, " "));
      std::cout << std::endl;
    }
    # nvcc -o t76 t76.cu
    # ./t76
    0 1 4 3
    5 2 1 2
    # ./t76 1
    0 1 2 3 4 5
    5 2 1 2 1 0
    #