Fusing two reduction operations in cuda Thrust

Is there a way to do a reduce_by_key operation and a reduce (or ideally another reduce_by_key) operation in only one kernel call in Thrust? Besides gaining computational speed, let us say I want to do this because the number of output values from the first reduce_by_key operation is too large to be stored in memory.

I have been wondering if transform_output_iterator could help here but have not found a solution.

A simple demonstration, but not my real use case, could be to find the minimum of the maximums of each row in a matrix, where that matrix is flattened and stored in a device_vector.

Solution

The following code computes the minimum of all row maximums with a fixed amount of temporary storage to store a limited number of minima. Afterwards, a min reduce is performed to find the global minimum

The idea is to directly update the minimum value via transform_output_iterator. This can be done via atomics (in case of raw pointers for temp minima) or via locks (in case of iterators for temp minima. not shown in this answer).

To avoid atomic contention, the number of temporary minima should not be too small.

For 1G segments of size 1,i.e. there will be an atomic operation for each input element, I observe the following timings on an A100 GPU.

time approach 1 (standard): 13.2674 ms.
time approach 2 (fused): 38.0479 ms. (minimaSlots = 1)
time approach 2 (fused): 23.9251 ms. (minimaSlots = 1024)
time approach 2 (fused): 10.1109 ms. (minimaSlots = 1024 * 1024)

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/iterator/discard_iterator.h>

#include <iostream>
#include <vector>
#include <limits>

template<size_t size>
struct UpdateMinimumOp{
    int* minPtr;

    UpdateMinimumOp(int* ptr):minPtr(ptr){}
    __device__
    int operator()(int value){
     // select output slot for minimum based on thread id
        const size_t pos = size_t(threadIdx.x) + size_t(blockIdx.x) * size_t(blockDim.x);
        const size_t minPos = pos % size;

        atomicMin(minPtr + minPos, value);
        return value;
    }
};

int main(){
    cudaEvent_t a; cudaEventCreate(&a);
    cudaEvent_t b; cudaEventCreate(&b);
    float t; 

    size_t N =  1ull << 30;
    thrust::device_vector<int> keys(N);
    thrust::device_vector<int> values(N);
    thrust::sequence(keys.begin(), keys.end(), 0);
    thrust::sequence(values.begin(), values.end(), 1);

    //Approach 1 (for timing comparison). max Reduce_by_key. then min reduce
    thrust::device_vector<int> maxima(N);

    cudaEventRecord(a);

    thrust::reduce_by_key(
        keys.begin(),
        keys.end(),
        values.begin(),
        thrust::make_discard_iterator(),
        maxima.begin(),
        thrust::equal_to<int>{},
        thrust::maximum<int>{}
    );

    int minimumApproach1 = thrust::reduce(maxima.begin(), maxima.end(), std::numeric_limits<int>::max(), thrust::minimum<int>{});

    cudaEventRecord(b);
    cudaEventSynchronize(b);
    cudaEventElapsedTime(&t, a,b);
    std::cout << "time approach 1 (standard): " << t << " ms. minimum: " <<minimumApproach1 << "\n";


    //Approach 2. Fuse max Reduce_by_key with the computation of the minimaSlots smallest maxima. then min reduce the stored smallest maxima
    //constexpr size_t minimaSlots = 1; 
    //constexpr size_t minimaSlots = 1024; 
    constexpr size_t minimaSlots = 1024*1024;
   
    thrust::device_vector<int> minima_of_maxima(minimaSlots);
    thrust::fill(minima_of_maxima.begin(), minima_of_maxima.end(), std::numeric_limits<int>::max());

    auto minimaOfMaximaIterator = thrust::make_transform_output_iterator(
        thrust::make_discard_iterator(),
        UpdateMinimumOp<minimaSlots>{minima_of_maxima.data().get()}
    );

    cudaEventRecord(a);

    thrust::reduce_by_key(
        keys.begin(),
        keys.end(),
        values.begin(),
        thrust::make_discard_iterator(),
        minimaOfMaximaIterator,
        thrust::equal_to<int>{},
        thrust::maximum<int>{}
    );

    int minimumApproach2 = thrust::reduce(minima_of_maxima.begin(), minima_of_maxima.end(), std::numeric_limits<int>::max(), thrust::minimum<int>{});

    cudaEventRecord(b);
    cudaEventSynchronize(b);
    cudaEventElapsedTime(&t, a,b);
    std::cout << "time approach 2 (fused): " << t << " ms. minimum: " << minimumApproach2 << "\n";

    cudaEventDestroy(a);
    cudaEventDestroy(b);
}