I am looking for solutions to use a memory pool within Thrust, as I want to limit the number of calls to cudaMalloc.
device_vector definitely accepts an allocator, but it's not so easy to deal with thrust::sort, which apparently will allocate a temporary buffer.
Based on the answer to How to use CUDA Thrust execution policy to override Thrust's low-level device memory allocator it seems that Thrust can be hooked to use special allocators by tweaking the execution policy, but it's quite old and I can't seem to find any doc about execution policies that explain how to proceed exactly.
For completeness, there is thrust/examples/cuda/custom_temporary_allocation.cu
, but it's not very satisfying as it's using a memory pool hooked as a global variable.
I think it is quite likely that the Thrust developers have thought about this and included some mechanism to allow injecting a custom memory manager into the execution policy; I just can't find it.
The following is an example allocator for stream-ordered memory allocation that uses cudaMallocAsync to allocate from the default cuda memory pool on a specific stream. Together with the par_nosync execution policy, this allows for fully asynchronous thrust::sort.
#include <new>
#include <thrust/device_malloc_allocator.h>
// Stream-ordered allocator for Thrust temporary storage.
//
// Allocations and deallocations are issued with cudaMallocAsync /
// cudaFreeAsync on a user-supplied stream, so they draw from the default
// CUDA memory pool and are ordered with the rest of the work on that
// stream instead of device-synchronizing like plain cudaMalloc.
template <class T>
struct ThrustAllocatorAsync : public thrust::device_malloc_allocator<T> {
public:
    using Base = thrust::device_malloc_allocator<T>;
    using pointer = typename Base::pointer;
    using size_type = typename Base::size_type;

    // The stream is not owned; it must outlive this allocator.
    explicit ThrustAllocatorAsync(cudaStream_t stream_) : stream{stream_} {}

    // Allocates sizeof(T) * num bytes, ordered on the stream.
    // Throws std::bad_alloc on failure instead of silently handing Thrust
    // a null device pointer (the original ignored the cudaMallocAsync
    // error and returned nullptr on out-of-memory).
    pointer allocate(size_type num) {
        T* result = nullptr;
        // On failure cudaMallocAsync leaves `result` untouched, so the
        // null check below detects the error.
        cudaMallocAsync(&result, sizeof(T) * num, stream);
        if (result == nullptr && num > 0) {
            throw std::bad_alloc();
        }
        return thrust::device_pointer_cast(result);
    }

    // Frees a block obtained from allocate(), ordered on the same stream.
    // `num` is required by the allocator interface but is unused here.
    void deallocate(pointer ptr, size_type /*num*/) {
        cudaFreeAsync(thrust::raw_pointer_cast(ptr), stream);
    }

private:
    cudaStream_t stream;
};
...
// par_nosync skips Thrust's implicit end-of-algorithm synchronization, and
// the custom allocator makes the temporary-buffer allocation stream-ordered,
// so the whole sort is enqueued on `stream` without blocking the host.
// NOTE(review): `data` and `stream` are assumed to be defined earlier —
// this is an illustrative fragment.
thrust::sort(
thrust::cuda::par_nosync(ThrustAllocatorAsync<char>(stream)).on(stream),
data.begin(),
data.end()
);
The same can be achieved with RMM as suggested in the comments.
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
#include <rmm/exec_policy.hpp>
...
// Could use any other class derived from rmm::mr::device_memory_resource
// (e.g. a pooling resource) to further reduce per-call allocation cost.
rmm::mr::cuda_async_memory_resource mr;
// rmm::exec_policy_nosync builds a par_nosync-style execution policy whose
// temporary storage comes from `mr`, ordered on `stream`.
thrust::sort(
rmm::exec_policy_nosync(stream, &mr),
data.begin(),
data.end()
);