I have a pool of particles represented by an array of float4 where the w component is the particle's current lifetime in the range [0, 1].
I need to sort this array based on the lifetime of the particles in descending order so that I can keep an accurate counter for how many particles are currently "active" (lifetime greater than 0). I need this counter because it will allow me to index into the correct spot in the array when I need to activate more particles (which happens randomly).
My array of particles is stored in device memory, and it seems like I should be able to sort it without having to transfer it back to host memory.
I haven't had much luck finding examples online that show how to do this with either Thrust or CUB. I am also hesitant to use Thrust because I don't know how to prevent it from falling back to merge sort (which is far slower than radix sort), since I need to sort based on the w component. As for CUB, I simply haven't found any resources at all showing how I could do this.
I would also prefer to keep the lifetime stored within the w component since this makes my life significantly easier in other parts of my code.
Is there a simple way to do this? Thanks for any help.
In either cub or thrust, we could sort on the .w "keys" only, doing a key-value sort where the values are just a linearly incrementing index: 0, 1, 2, 3, ... We could then use the resultant rearrangement of the index sequence to reorder the original float4 array in one step (so it ends up effectively sorted by .w). This would allow you to retain the radix sorting speed (in either cub or thrust) and should also be fairly efficient, since the float4 quantities only need to be moved/rearranged once, rather than being moved repeatedly during the sort operation.
Here is a fully worked example in thrust, on 32M elements, demonstrating an "ordinary" thrust sort using a functor that compares on the .w element (sort_f4_w), followed by the approach described above. In this case, on my particular setup (Fedora 20, CUDA 7, Quadro5000), the second approach seems to be about 5x faster:
$ cat t686.cu
#include <iostream>
#include <vector_types.h>
#include <stdlib.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/equal.h>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
#define DSIZE (32*1048576)
// comparison functor: orders float4 elements by their .w component
// (ascending here; use > instead of < if you want descending order)
struct sort_f4_w
{
__host__ __device__
bool operator()(const float4 &a, const float4 &b) const {
return (a.w < b.w);}
};
// functor to extract the .w element from a float4
struct f4_to_fw : public thrust::unary_function<float4, float>
{
__host__ __device__
float operator()(const float4 &a) const {
return a.w;}
};
// functor to extract the .x element from a float4
struct f4_to_fx : public thrust::unary_function<float4, float>
{
__host__ __device__
float operator()(const float4 &a) const {
return a.x;}
};
bool validate(thrust::device_vector<float4> &d1, thrust::device_vector<float4> &d2){
return thrust::equal(thrust::make_transform_iterator(d1.begin(), f4_to_fx()), thrust::make_transform_iterator(d1.end(), f4_to_fx()), thrust::make_transform_iterator(d2.begin(), f4_to_fx()));
}
int main(){
unsigned long long t1_time, t2_time;
float4 *mydata = new float4[DSIZE];
for (int i = 0; i < DSIZE; i++){
mydata[i].x = i;
mydata[i].y = i;
mydata[i].z = i;
mydata[i].w = rand()/(float)RAND_MAX;}
thrust::host_vector<float4> h_data(mydata, mydata+DSIZE);
// do once as a warm-up run, then report timings on second run
for (int i = 0; i < 2; i++){
thrust::device_vector<float4> d_data1 = h_data;
thrust::device_vector<float4> d_data2 = h_data;
// first time sort using typical thrust approach
t1_time = dtime_usec(0);
thrust::sort(d_data1.begin(), d_data1.end(), sort_f4_w());
cudaDeviceSynchronize();
t1_time = dtime_usec(t1_time);
// now extract keys and create index values, sort, then rearrange
t2_time = dtime_usec(0);
thrust::device_vector<float> keys(DSIZE);
thrust::device_vector<int> vals(DSIZE);
thrust::copy(thrust::make_transform_iterator(d_data2.begin(), f4_to_fw()), thrust::make_transform_iterator(d_data2.end(), f4_to_fw()), keys.begin());
thrust::sequence(vals.begin(), vals.end());
thrust::sort_by_key(keys.begin(), keys.end(), vals.begin());
thrust::device_vector<float4> result(DSIZE);
thrust::copy(thrust::make_permutation_iterator(d_data2.begin(), vals.begin()), thrust::make_permutation_iterator(d_data2.begin(), vals.end()), result.begin());
cudaDeviceSynchronize();
t2_time = dtime_usec(t2_time);
if (!validate(d_data1, result)){
std::cout << "Validation failure " << std::endl;
}
}
std::cout << "thrust t1 time: " << t1_time/(float)USECPSEC << "s, t2 time: " << t2_time/(float)USECPSEC << std::endl;
}
$ nvcc -o t686 t686.cu
$ ./t686
thrust t1 time: 0.731456s, t2 time: 0.149959
$
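If you'd rather go the cub route, a minimal sketch of the same key/index idea might look like the code below (untested, written from memory). It uses cub::DeviceRadixSort::SortPairsDescending so the largest lifetimes land at the front, and two small helper kernels (extract_w_and_index, gather_f4 -- hypothetical names, not part of any library) to pull out the keys/indices and to do the single reordering pass:

#include <cuda_runtime.h>
#include <cub/cub.cuh>

// copy each particle's .w into a separate key array and record its index
__global__ void extract_w_and_index(const float4 *in, float *keys, int *idx, int n){
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n){ keys[i] = in[i].w; idx[i] = i; }
}

// reorder the float4 data in one pass, following the sorted index sequence
__global__ void gather_f4(const float4 *in, const int *idx, float4 *out, int n){
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[idx[i]];
}

void sort_particles_by_w_desc(const float4 *d_in, float4 *d_out, int n){
  float *d_keys_in, *d_keys_out;
  int *d_idx_in, *d_idx_out;
  cudaMalloc(&d_keys_in,  n*sizeof(float));
  cudaMalloc(&d_keys_out, n*sizeof(float));
  cudaMalloc(&d_idx_in,   n*sizeof(int));
  cudaMalloc(&d_idx_out,  n*sizeof(int));
  extract_w_and_index<<<(n+255)/256, 256>>>(d_in, d_keys_in, d_idx_in, n);
  // first call with NULL temp storage just queries the required size
  void *d_temp = NULL; size_t temp_bytes = 0;
  cub::DeviceRadixSort::SortPairsDescending(d_temp, temp_bytes, d_keys_in, d_keys_out, d_idx_in, d_idx_out, n);
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceRadixSort::SortPairsDescending(d_temp, temp_bytes, d_keys_in, d_keys_out, d_idx_in, d_idx_out, n);
  // single gather pass to put the float4 data in sorted (descending .w) order
  gather_f4<<<(n+255)/256, 256>>>(d_in, d_idx_out, d_out, n);
  cudaFree(d_temp); cudaFree(d_keys_in); cudaFree(d_keys_out); cudaFree(d_idx_in); cudaFree(d_idx_out);
}

The structure mirrors the thrust version: sort only (key, index) pairs with the radix sort, then move each 16-byte float4 exactly once in the gather.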
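As for the "active" counter from your question: counting every lifetime greater than zero works regardless of sort direction, and after a descending sort those particles also form a contiguous prefix, so the count doubles as the index of the first inactive slot. A minimal thrust sketch, reusing the f4_to_fw functor from the example above (count_active is just a hypothetical helper name):

#include <thrust/device_vector.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/count.h>
#include <thrust/functional.h>

// count particles with lifetime > 0; after a descending sort these occupy the
// front of the array, so this value is also where newly activated particles go
int count_active(const thrust::device_vector<float4> &particles){
  return thrust::count_if(
      thrust::make_transform_iterator(particles.begin(), f4_to_fw()),
      thrust::make_transform_iterator(particles.end(),   f4_to_fw()),
      thrust::placeholders::_1 > 0.0f);
}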