I am looking for solutions to use a memory pool within Thrust, as I want to limit the number of calls to cudaMalloc.
device_vector definitely accepts an allocator, but it's not so easy to deal with thrust::sort, which apparently will allocate a temporary buffer.
Based on the answer to How to use CUDA Thrust execution policy to override Thrust's low-level device memory allocator it seems that Thrust can be hooked to use special allocators by tweaking the execution policy, but it's quite old and I can't seem to find any doc about execution policies that explain how to proceed exactly.
For completeness, there is thrust/examples/cuda/custom_temporary_allocation.cu
, but it's not very satisfying as it's using a memory pool hooked as a global variable.
I think it is quite likely that the Thrust developers have thought about this and included some mechanism to allow injecting a custom memory manager into the execution policy; I just can't find it.
The following is an example allocator for stream-ordered memory allocation that uses cudaMallocAsync to allocate from the default cuda memory pool on a specific stream. Together with the par_nosync execution policy, this allows for fully asynchronous thrust::sort.
#include <new>
#include <thrust/device_malloc_allocator.h>
// Stream-ordered allocator for Thrust temporary storage.
//
// Allocations and deallocations are issued with cudaMallocAsync /
// cudaFreeAsync on a user-supplied stream, so they draw from the default
// CUDA memory pool and are ordered with the rest of the work on that
// stream instead of device-synchronizing like plain cudaMalloc.
template <class T>
struct ThrustAllocatorAsync : public thrust::device_malloc_allocator<T> {
public:
    using Base = thrust::device_malloc_allocator<T>;
    using pointer = typename Base::pointer;
    using size_type = typename Base::size_type;

    // The stream is not owned; it must outlive this allocator.
    explicit ThrustAllocatorAsync(cudaStream_t stream_) : stream{stream_} {}

    // Allocates sizeof(T) * num bytes, ordered on the stream.
    // Throws std::bad_alloc on failure instead of silently handing Thrust
    // a null device pointer (the original ignored the cudaMallocAsync
    // error and returned nullptr on out-of-memory).
    pointer allocate(size_type num) {
        T* result = nullptr;
        // On failure cudaMallocAsync leaves `result` untouched, so the
        // null check below detects the error.
        cudaMallocAsync(&result, sizeof(T) * num, stream);
        if (result == nullptr && num > 0) {
            throw std::bad_alloc();
        }
        return thrust::device_pointer_cast(result);
    }

    // Frees a block obtained from allocate(), ordered on the same stream.
    // `num` is required by the allocator interface but is unused here.
    void deallocate(pointer ptr, size_type /*num*/) {
        cudaFreeAsync(thrust::raw_pointer_cast(ptr), stream);
    }

private:
    cudaStream_t stream;
};
...
// par_nosync skips Thrust's implicit end-of-algorithm synchronization, and
// the custom allocator makes the temporary-buffer allocation stream-ordered,
// so the whole sort is enqueued on `stream` without blocking the host.
// NOTE(review): `data` and `stream` are assumed to be defined earlier —
// this is an illustrative fragment.
thrust::sort(
thrust::cuda::par_nosync(ThrustAllocatorAsync<char>(stream)).on(stream),
data.begin(),
data.end()
);
The same can be achieved with RMM as suggested in the comments.
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
#include <rmm/exec_policy.hpp>
...
// Could use any other class derived from rmm::mr::device_memory_resource
// (e.g. a pooling resource) to further reduce per-call allocation cost.
rmm::mr::cuda_async_memory_resource mr;
// rmm::exec_policy_nosync builds a par_nosync-style execution policy whose
// temporary storage comes from `mr`, ordered on `stream`.
thrust::sort(
rmm::exec_policy_nosync(stream, &mr),
data.begin(),
data.end()
);