c++ · cuda · thrust · allocator · memory-pool

memory pool in thrust execution policy


I am looking for a way to use a memory pool within Thrust, as I want to limit the number of calls to cudaMalloc. device_vector definitely accepts an allocator, but it's not so easy to deal with thrust::sort, which apparently allocates a temporary buffer internally.
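For illustration, this is the kind of allocator hook I mean for device_vector (the allocator name is just a placeholder):

    #include <thrust/device_vector.h>
    #include <thrust/device_malloc_allocator.h>
    
    // Placeholder pool-backed allocator; only the wiring matters here.
    template <class T>
    struct PoolAllocator : thrust::device_malloc_allocator<T> {
        // allocate()/deallocate() would draw from a pre-allocated pool
    };
    
    // device_vector takes the allocator as its second template parameter,
    // but thrust::sort's temporary buffer does not go through it.
    thrust::device_vector<int, PoolAllocator<int>> vec(1000);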

Based on the answer to "How to use CUDA Thrust execution policy to override Thrust's low-level device memory allocator", it seems that Thrust can be hooked to use custom allocators by tweaking the execution policy, but that answer is quite old and I can't find any documentation on execution policies that explains how to proceed exactly.

For completeness, there is thrust/examples/cuda/custom_temporary_allocation.cu, but it's not very satisfying, as it hooks the memory pool in through a global variable.

I think it is quite likely that the Thrust developers have thought about this and included some mechanism for injecting a custom memory manager into the execution policy; I just can't find it.


Solution

  • The following is an example allocator for stream-ordered memory allocation that uses cudaMallocAsync to allocate from the default CUDA memory pool on a specific stream. Together with the par_nosync execution policy, this allows for a fully asynchronous thrust::sort.

    #include <thrust/device_malloc_allocator.h>
    #include <thrust/device_ptr.h>
    #include <cuda_runtime.h>
    #include <new> // std::bad_alloc
    
    // Allocator that performs stream-ordered allocations from the default
    // CUDA memory pool via cudaMallocAsync/cudaFreeAsync (requires CUDA 11.2+).
    template <class T>
    struct ThrustAllocatorAsync : public thrust::device_malloc_allocator<T> {
    public:
        using Base      = thrust::device_malloc_allocator<T>;
        using pointer   = typename Base::pointer;
        using size_type = typename Base::size_type;
    
        ThrustAllocatorAsync(cudaStream_t stream_) : stream{stream_} {}
    
        pointer allocate(size_type num){
            T* result = nullptr;
            if (cudaMallocAsync(&result, sizeof(T) * num, stream) != cudaSuccess){
                throw std::bad_alloc();
            }
            return thrust::device_pointer_cast(result);
        }
    
        void deallocate(pointer ptr, size_type /*num*/){
            // Stream-ordered free: the memory becomes reusable for later
            // allocations on the same stream without a device synchronization.
            cudaFreeAsync(thrust::raw_pointer_cast(ptr), stream);
        }
    
    private:
        cudaStream_t stream;
    };
    
    ...
    
    thrust::sort(
       thrust::cuda::par_nosync(ThrustAllocatorAsync<char>(stream)).on(stream),
       data.begin(),
       data.end()
    );
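
    For reference, a minimal sketch of the setup elided by the ... above; the container type, sizes, and the explicit synchronization are assumptions, not part of the original answer:

    // Assumed setup around the call above (names and sizes are illustrative).
    #include <thrust/device_vector.h>
    #include <thrust/sort.h>
    
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    
    thrust::device_vector<int> data(1 << 20);
    // ... fill data, then run the thrust::sort call shown above ...
    
    // par_nosync omits the implicit blocking synchronization, so wait on the
    // stream before using the sorted data.
    cudaStreamSynchronize(stream);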
    
    

    The same can be achieved with RMM as suggested in the comments.

    #include <rmm/mr/device/cuda_async_memory_resource.hpp> 
    #include <rmm/exec_policy.hpp>
    
    ...
    // could use any other class derived from rmm::mr::device_memory_resource
    rmm::mr::cuda_async_memory_resource mr; 
    
    thrust::sort(
       rmm::exec_policy_nosync(stream, &mr),
       data.begin(),
       data.end()
    );
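
    If the goal is to avoid the driver's async pool entirely and sub-allocate from one big upfront allocation, RMM's pool_memory_resource can be used instead; a minimal sketch, assuming a recent RMM where the initial pool size is passed to the constructor:

    #include <rmm/mr/device/cuda_memory_resource.hpp>
    #include <rmm/mr/device/pool_memory_resource.hpp>
    #include <rmm/exec_policy.hpp>
    
    // Pool that grabs one large upstream allocation and serves temporaries
    // from it, so Thrust temporaries never hit cudaMalloc directly.
    rmm::mr::cuda_memory_resource upstream;
    rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{
        &upstream, 1u << 28 /* initial pool size in bytes, illustrative */};
    
    thrust::sort(
       rmm::exec_policy_nosync(stream, &pool_mr),
       data.begin(),
       data.end()
    );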