c++, boost-asio, c++-coroutine

asio::async_read performance for many files


I'm testing the latest standalone Asio (asio-1.30.2) to asynchronously read many files (~17000) in a directory and all of its subdirectories, and I see terrible performance when the reading is done asynchronously compared to the plain synchronous way.

#include <asio.hpp>
#include <asio/stream_file.hpp>
#include <array>
#include <chrono>
#include <filesystem>
#include <iostream>
#include <thread>
#include <vector>

static constexpr const char* DIR = "<your directory>";
static constexpr size_t DATA_SIZE = 16 * 1024;

asio::io_context io_context;
asio::strand strand = asio::make_strand(io_context);

template<typename Iter>
void hash(Iter begin, Iter end, uint32_t& hash) {
    for (auto it = begin; it != end; it++) {
        hash += *it;
    }
}


std::vector<std::thread> start_threads(int num_threads) {
    std::vector<std::thread> threads;
    for (int i = 0; i < num_threads; ++i) {
        threads.emplace_back([]() { io_context.run(); });
    }
    return threads;
}

asio::awaitable<void> read_file_async(std::string fileName)
{
    asio::stream_file file{ strand, fileName, asio::stream_file::flags::read_only };

    std::array<char, DATA_SIZE> data;
    uint32_t h = 0;
    for (;;) {
        auto [ec, n] = co_await asio::async_read(file, asio::buffer(data), asio::transfer_at_least(data.size()), asio::as_tuple(asio::use_awaitable));
        if (!ec) {
            hash(data.begin(), data.begin() + n, h);
        }
        else if (ec == asio::error::eof) {
            hash(data.begin(), data.begin() + n, h);
            co_return;
        }
        else {
            std::cout << "Error: " << ec << "\n";
            co_return;
        }
    }
}

int main()
{
#if 1
    {
        auto start = std::chrono::steady_clock::now();

        for (const auto& entry : std::filesystem::recursive_directory_iterator(DIR, std::filesystem::directory_options::follow_directory_symlink)) {
            if (!entry.is_regular_file())
                continue;

            auto filePath = std::filesystem::path(entry.path()).lexically_normal().generic_string();
            asio::stream_file file{ io_context, filePath, asio::stream_file::flags::read_only };

            std::array<char, DATA_SIZE> data = { 0 };
            uint32_t h = 0;

            size_t n = 0;
            try {
                n = asio::read(file, asio::buffer(data), asio::transfer_at_least(data.size()));
                hash(data.begin(), data.begin() + n, h);
            }
            catch (const std::system_error& e) {
                if (e.code() == asio::error::eof && n > 0) {
                    hash(data.begin(), data.begin() + n, h);
                };
            }
        }
        auto end = std::chrono::steady_clock::now();

        std::cout << "Execution (sync) time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " milliseconds." << std::endl;
    }
#endif
#if 1
    {
        auto start = std::chrono::steady_clock::now();
        auto workGuard = asio::make_work_guard(strand);
        auto threads = start_threads(8);

        for (const auto& entry : std::filesystem::recursive_directory_iterator(DIR, std::filesystem::directory_options::follow_directory_symlink)) {
            if (!entry.is_regular_file())
                continue;

            auto filePath = std::filesystem::path(entry.path()).lexically_normal().generic_string();
            asio::co_spawn(
                strand,
                read_file_async(filePath),
                asio::detached
            );
        }

        workGuard.reset();

        for (auto& thread : threads)
            thread.join();

        auto end = std::chrono::steady_clock::now();

        std::cout << "Execution (async) time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " milliseconds." << std::endl;
    }
#endif
    return 0;
}

This one gives me:

Execution (sync) time: 3013 milliseconds.
Execution (async) time: 27203 milliseconds.

on Windows (IOCP), compiled with MSVC in "Release" mode.

on Linux (liburing), compiled with g++ and -std=c++20 -O3:

Execution (sync) time: 143 milliseconds.
Execution (async) time: 21275 milliseconds.

So basically the async version is 10x to 150x slower.

If I profile the Linux version with strace -fc (with the sync part commented out):

strace -fc ./a.out
strace: Process 7087 attached
strace: Process 7088 attached
strace: Process 7089 attached
strace: Process 7090 attached
strace: Process 7091 attached
strace: Process 7092 attached
strace: Process 7093 attached
strace: Process 7094 attached
Execution (async) time: 127614 milliseconds.
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ------------------
 88.20  566.134189         223   2527624     75001 futex
  8.55   54.877000         227    241362           epoll_wait
  1.86   11.966419          49    240335           io_uring_enter
  0.94    6.015802          25    237095      3744 read
  0.16    1.058005          59     17664           openat
  0.13    0.820848          47     17291           fadvise64
  0.08    0.541784          30     17669           close
  0.03    0.216767          55      3935           mprotect
  0.02    0.122419          56      2160           brk
  0.01    0.046680          63       740           getdents64
  0.00    0.031456          42       732           fcntl
  0.00    0.028376          89       317           epoll_ctl
  0.00    0.020176          16      1257           madvise
  0.00    0.015767          42       373           newfstatat
  0.00    0.001766          40        44           mmap
  0.00    0.001036          31        33           rt_sigprocmask
  0.00    0.000691          53        13           munmap
  0.00    0.000646          80         8           clone3
  0.00    0.000375         375         1           io_uring_setup
  0.00    0.000341          37         9           rseq
  0.00    0.000226          25         9           set_robust_list
  0.00    0.000107          53         2           eventfd2
  0.00    0.000049          12         4           pread64
  0.00    0.000044          22         2           write
  0.00    0.000023          23         1           timerfd_create
  0.00    0.000014          14         1           io_uring_register
  0.00    0.000013          13         1           rt_sigaction
  0.00    0.000013          13         1           epoll_create1
  0.00    0.000012          12         1           getrandom
  0.00    0.000011           5         2         1 arch_prctl
  0.00    0.000011          11         1           set_tid_address
  0.00    0.000011          11         1           prlimit64
  0.00    0.000000           0         1         1 access
  0.00    0.000000           0         1           execve
------ ----------- ----------- --------- --------- ------------------
100.00  641.901077         194   3308690     78747 total

So basically Asio spends most of its time in some sort of mutex. Clearly I'm doing something wrong to get such disappointing results, but I don't know how to improve the code to actually benefit from asynchronous file reads.

Additionally, I tried changing the number of threads in the async section, for example to 1. On Windows it gives roughly the same results; on Linux it runs so slowly that I couldn't even wait for it to finish (maybe it deadlocks).

I also tried using std::counting_semaphore sem(<various values>), acquiring the semaphore in the loop:

for (const auto& entry : std::filesystem::recursive_directory_iterator(DIR, std::filesystem::directory_options::follow_directory_symlink)) {
 // acquire
}

and releasing it in

if (e.code() == asio::error::eof && n > 0) {
 // release
}

This did help quite a bit on Linux for

std::counting_semaphore sem(8);
auto threads = start_threads(1);
Execution (sync) time: 141 milliseconds.
Execution (async) time: 2717 milliseconds.

but it gets worse if I increase the number of threads:

std::counting_semaphore sem(8);
auto threads = start_threads(2);
Execution (sync) time: 143 milliseconds.
Execution (async) time: 6048 milliseconds.

Overall, in every case it's more than a 10x slowdown compared to the synchronous code.


Solution

  • I think the assumptions are the problem here. What makes you think async IO is faster here?

    Async operation scheduling benefits scenarios where IO operations have varying completion times: that way, whichever operation completes first can be serviced first.

    However, disk IO has its own characteristics. Disk IO - on the same volume - usually has very predictable latency, and some hardware strongly favors sequential access. Because of that, doing asynchronous IO, or IO from multiple threads, is likely to HURT performance. This is, e.g., very typical for magnetic storage (spinning HDDs).

    Besides, platforms have different ways to hint at sequential reading, which allows the kernel/VFS to actively pre-fetch data.

    So, all in all, let's measure.

    SSD Linux benchmark

    I ran your program on my machine. I pointed it at my /home/sehe/custom/ which contains my modular boost git checkout, among others (138792 files in 17939 directories, totaling 3.8GiB).

    Without any changes except the necessary defines:

    target_compile_definitions(sotest PRIVATE ASIO_HAS_IO_URING)
    target_compile_definitions(sotest PRIVATE ASIO_HAS_FILE)
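
    (Note: with ASIO_HAS_IO_URING defined, the executable also has to link against liburing; with the same CMake target that would be something like:)

    target_link_libraries(sotest PRIVATE uring)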
    

    Repeated measurements show the same performance REGARDLESS of actual disk IO, because of the kernel block cache:

    [screenshot: benchmark timings]

    Also disabling EPOLL slightly improves it:

    target_compile_definitions(sotest PRIVATE ASIO_DISABLE_EPOLL)
    

    [screenshot: benchmark timings]

    Then I realized I was still running with -fsanitize=undefined,address, so disabling that:

    [screenshot: benchmark timings]

    Voilà! Our first hint that async/multi-threading slows us down? Let's swap the order:

    [screenshot: benchmark timings]

    Is it multi-threading or async? Let's remove the threads:

    [screenshot: benchmark timings]

    As you can see, any significant slowdown SHOULD be down to the filesystem/media performance under different access patterns.

    Dropping disk cache

    Using root privileges, we can forcibly drop the caches prior to the benchmarks:

    void drop_caches() {
        // needs root privs
    #if 0
        std::ofstream("/proc/sys/vm/drop_caches") << 3 << std::endl;
        std::this_thread::sleep_for(std::chrono::seconds(3));
    #endif
    }
    

    [screenshot: benchmark timings]

    Re-enabling the threads (8):

    [screenshot: benchmark timings]

    Once again, async IO helps, but threads hurt. This is likely because of the kernel's prefetch.

    Suggestions

    I'd suggest making a queue of files to scan and having a configurable number of threads read them. That way each file naturally has thread affinity, which allows the kernel to predict access patterns and benefit from block prefetch.
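
    A minimal sketch of that idea (the names are mine, not from the code above; plain synchronous reads, with asio::thread_pool used purely as the work queue so that each file is read start-to-finish on one thread):

    #include <asio.hpp>
    #include <cstdint>
    #include <filesystem>
    #include <fstream>

    namespace fs = std::filesystem;

    // Read one file synchronously; the whole file stays on the calling thread
    static uint32_t hash_file(fs::path const& p) {
        uint32_t h = 0;
        std::ifstream in(p, std::ios::binary);
        char buf[16 * 1024];
        while (in) {
            in.read(buf, sizeof(buf));
            for (std::streamsize i = 0, n = in.gcount(); i < n; ++i)
                h += static_cast<unsigned char>(buf[i]);
        }
        return h;
    }

    int main() {
        asio::thread_pool pool(4); // configurable number of reader threads

        for (auto const& entry : fs::recursive_directory_iterator("<your directory>"))
            if (entry.is_regular_file())
                asio::post(pool, [p = entry.path()] { hash_file(p); }); // result discarded in this sketch

        pool.join(); // wait until the queue of files is drained
    }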

    Ideally, you'd add platform-specific calls to hint the kernel about the intent to read the entire file sequentially (posix_fadvise with POSIX_FADV_SEQUENTIAL, Win32 FILE_FLAG_SEQUENTIAL_SCAN).
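
    On Linux that hint could look roughly like this (a sketch; it assumes the io_uring backend, where stream_file::native_handle() is an ordinary file descriptor, and the function name is made up):

    #include <asio.hpp>
    #include <asio/stream_file.hpp>
    #include <fcntl.h> // posix_fadvise, POSIX_FADV_SEQUENTIAL

    asio::awaitable<void> read_file_async_hinted(std::string fileName) {
        auto ex = co_await asio::this_coro::executor;
        asio::stream_file file{ex, fileName, asio::stream_file::flags::read_only};

        // Tell the kernel we intend to read this file front to back, so it can
        // schedule aggressive read-ahead (return value ignored in this sketch)
        ::posix_fadvise(file.native_handle(), 0, 0, POSIX_FADV_SEQUENTIAL);

        // ... read loop exactly as in read_file_async() above ...
        co_return;
    }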

    I expect that file mappings work even better (using madvise instead; similar hints exist on Windows).
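
    For example (a plain POSIX sketch with a made-up function name; zero-sized files and most error handling are skipped):

    #include <cstdint>
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    // Map the whole file and hint sequential access before touching the pages
    uint32_t hash_mapped(char const* path) {
        uint32_t h = 0;
        int fd = ::open(path, O_RDONLY);
        if (fd < 0)
            return h;
        struct stat st{};
        if (::fstat(fd, &st) == 0 && st.st_size > 0) {
            auto len = static_cast<size_t>(st.st_size);
            void* p = ::mmap(nullptr, len, PROT_READ, MAP_PRIVATE, fd, 0);
            if (p != MAP_FAILED) {
                ::madvise(p, len, MADV_SEQUENTIAL); // read-ahead hint
                auto const* data = static_cast<unsigned char const*>(p);
                for (size_t i = 0; i < len; ++i)
                    h += data[i];
                ::munmap(p, len);
            }
        }
        ::close(fd);
        return h;
    }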

    I could draw up a test program to test the difference on Linux.

    Listing used for above benchmarks

    #include <asio.hpp>
    #include <asio/stream_file.hpp>
    #include <filesystem>
    #include <iostream>
    #include <fstream>
    // All necessary includes
    
    static constexpr const char* DIR = "/home/sehe/custom/";
    static constexpr size_t DATA_SIZE = 16 * 1024;
    
    asio::io_context io_context;
    asio::strand strand = asio::make_strand(io_context);
    
    template <typename Iter> void hash(Iter begin, Iter end, uint32_t& hash) {
        for (auto it = begin; it != end; it++)
            hash += static_cast<uint32_t>(*it);
    }
    
    std::vector<std::thread> start_threads(unsigned num_threads) {
        std::vector<std::thread> threads;
        for (unsigned i = 0; i < num_threads; ++i)
            threads.emplace_back([]() { io_context.run(); });
        return threads;
    }
    
    asio::awaitable<void> read_file_async(std::string fileName)
    {
        asio::stream_file file{ strand, fileName, asio::stream_file::flags::read_only };
    
        std::array<char, DATA_SIZE> data;
        uint32_t h = 0;
        for (;;) {
            auto [ec, n] = co_await asio::async_read(file, asio::buffer(data), asio::transfer_at_least(data.size()), asio::as_tuple(asio::use_awaitable));
            if (!ec) {
                hash(data.begin(), data.begin() + n, h);
            }
            else if (ec == asio::error::eof) {
                hash(data.begin(), data.begin() + n, h);
                co_return;
            }
            else {
                std::cout << "Error: " << ec << "\n";
                co_return;
            }
        }
    }
    
    void drop_caches() {
        // needs root privs
    #if 1
        std::ofstream("/proc/sys/vm/drop_caches") << 3 << std::endl;
        std::this_thread::sleep_for(std::chrono::seconds(3));
    #endif
    }
    
    void bench_sync() {
        drop_caches();
        auto start = std::chrono::steady_clock::now();
    
        for (const auto& entry : std::filesystem::recursive_directory_iterator(DIR, std::filesystem::directory_options::follow_directory_symlink)) {
            if (!entry.is_regular_file())
                continue;
    
            auto filePath = std::filesystem::path(entry.path()).lexically_normal().generic_string();
            asio::stream_file file{ io_context, filePath, asio::stream_file::flags::read_only };
    
            std::array<char, DATA_SIZE> data = { 0 };
            uint32_t h = 0;
    
            size_t n = 0;
            try {
                n = asio::read(file, asio::buffer(data), asio::transfer_at_least(data.size()));
                hash(data.begin(), data.begin() + n, h);
            } catch (std::system_error const& e) {
                if (e.code() == asio::error::eof && n > 0) {
                    hash(data.begin(), data.begin() + n, h);
                };
            }
        }
        auto end = std::chrono::steady_clock::now();
    
        std::cout << "Execution (sync) time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " milliseconds." << std::endl;
    }
    
    void bench_async() {
        drop_caches();
        auto start     = std::chrono::steady_clock::now();
        auto workGuard = asio::make_work_guard(strand);
        auto threads = start_threads(8);
    
        for (const auto& entry : std::filesystem::recursive_directory_iterator(DIR, std::filesystem::directory_options::follow_directory_symlink)) {
            if (!entry.is_regular_file())
                continue;
    
            auto filePath = std::filesystem::path(entry.path()).lexically_normal().generic_string();
            asio::co_spawn(
                    strand,
                    read_file_async(filePath),
                    asio::detached
                    );
        }
    
        workGuard.reset();
    
        for (auto& thread : threads)
            thread.join();
    
        auto end = std::chrono::steady_clock::now();
    
        std::cout << "Execution (async) time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " milliseconds." << std::endl;
    }
    
    int main() {
        bench_async();
        bench_sync();
    }
    

    BONUS: Memory mapping and "queueing"

    Still using an Asio thread pool for the work queueing, this handles all files in a single directory sequentially, but queues subdirectories. You will have to figure out whether this method of queueing works for you (if you have many small/empty directories, it might not):

    Live On Coliru (using Boost Asio and "." on Coliru)

    #include <asio.hpp>
    #include <boost/iostreams/device/mapped_file.hpp>
    #include <boost/iostreams/stream.hpp>
    #include <filesystem>
    #include <iostream>
    namespace fs = std::filesystem;
    
    
    template <typename Iter> void hash(Iter begin, Iter end, uint32_t& hash) {
        for (auto it = begin; it != end; it++)
            hash += static_cast<uint32_t>(*it);
    }
    
    void hash_mapped_file(fs::path const& fileName) {
        std::cout << "Hashing " << fileName << std::endl;
        uint32_t h = 0;
        boost::iostreams::mapped_file_source file(fileName);
        hash(file.begin(), file.end(), h);
    }
    
    void hash_dir(fs::path const& root) {
        auto start = std::chrono::steady_clock::now();
    
        asio::thread_pool pool(8);
    
        auto process = [ex = pool.get_executor()](this auto const& self, fs::path const& dir) -> void {
            for (auto const& entry : fs::directory_iterator(dir))
                if (entry.is_directory())
                    post(ex, std::bind(self, entry.path()));
                else if (entry.is_regular_file() && entry.file_size()) // cannot map 0-sized files
                    hash_mapped_file(entry.path());
        };
    
        process(root);
    
        pool.join();
    
        auto end = std::chrono::steady_clock::now();
    
        std::cout << "Execution time: "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms"
                  << std::endl;
    }
    
    int main() { hash_dir("/home/sehe/custom/"); }
    

    This runs circles around any of the prior solutions. I actually added

    std::cout << "Hashing " << fileName << "\n";
    

    to make sure that it was actually doing all the work... Even with the added console IO it's ~300ms tops:

    [screenshot: timing output with the added console IO]