I'm testing the latest standalone Asio (asio-1.30.2) to asynchronously read many files (~17000) in a directory and all its subdirectories, and I see terrible performance doing it the async way compared to the plain synchronous way.
// (built with ASIO_HAS_FILE defined, plus ASIO_HAS_IO_URING on Linux)
#include <asio.hpp>
#include <asio/stream_file.hpp>
#include <array>
#include <chrono>
#include <cstdint>
#include <filesystem>
#include <iostream>
#include <string>
#include <thread>
#include <vector>
static constexpr const char* DIR = "<your directory>";
static constexpr size_t DATA_SIZE = 16 * 1024;
asio::io_context io_context;
auto strand = asio::make_strand(io_context);
template<typename Iter>
void hash(Iter begin, Iter end, uint32_t& hash) {
for (auto it = begin; it != end; it++) {
hash += *it;
}
}
std::vector<std::thread> start_threads(int num_threads) {
std::vector<std::thread> threads;
for (int i = 0; i < num_threads; ++i) {
threads.emplace_back([]() { io_context.run(); });
}
return threads;
}
asio::awaitable<void> read_file_async(std::string fileName)
{
asio::stream_file file{ strand, fileName, asio::stream_file::flags::read_only };
std::array<char, DATA_SIZE> data;
uint32_t h = 0;
for (;;) {
auto [ec, n] = co_await asio::async_read(file, asio::buffer(data), asio::transfer_at_least(data.size()), asio::as_tuple(asio::use_awaitable));
if (!ec) {
hash(data.begin(), data.begin() + n, h);
}
else if (ec == asio::error::eof) {
hash(data.begin(), data.begin() + n, h);
co_return;
}
else {
std::cout << "Error: " << ec << "\n";
co_return;
}
}
}
int main()
{
#if 1
{
auto start = std::chrono::steady_clock::now();
for (const auto& entry : std::filesystem::recursive_directory_iterator(DIR, std::filesystem::directory_options::follow_directory_symlink)) {
if (!entry.is_regular_file())
continue;
auto filePath = std::filesystem::path(entry.path()).lexically_normal().generic_string();
asio::stream_file file{ io_context, filePath, asio::stream_file::flags::read_only };
std::array<char, DATA_SIZE> data = { 0 };
uint32_t h = 0;
size_t n = 0;
try {
n = asio::read(file, asio::buffer(data), asio::transfer_at_least(data.size()));
hash(data.begin(), data.begin() + n, h);
}
catch (const std::system_error& e) {
if (e.code() == asio::error::eof && n > 0) {
hash(data.begin(), data.begin() + n, h);
};
}
}
auto end = std::chrono::steady_clock::now();
std::cout << "Execution (sync) time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " milliseconds." << std::endl;
}
#endif
#if 1
{
auto start = std::chrono::steady_clock::now();
auto workGuard = asio::make_work_guard(strand);
auto threads = start_threads(8);
for (const auto& entry : std::filesystem::recursive_directory_iterator(DIR, std::filesystem::directory_options::follow_directory_symlink)) {
if (!entry.is_regular_file())
continue;
auto filePath = std::filesystem::path(entry.path()).lexically_normal().generic_string();
asio::co_spawn(
strand,
read_file_async(filePath),
asio::detached
);
}
workGuard.reset();
for (auto& thread : threads)
thread.join();
auto end = std::chrono::steady_clock::now();
std::cout << "Execution (async) time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " milliseconds." << std::endl;
}
#endif
return 0;
}
This one gives me:
Execution (sync) time: 3013 milliseconds.
Execution (async) time: 27203 milliseconds.
on Windows (IOCP), compiled with MSVC in Release mode.
on Linux (io_uring via liburing), compiled with g++ -std=c++20 -O3:
Execution (sync) time: 143 milliseconds.
Execution (async) time: 21275 milliseconds.
So basically the performance is 10x to 150x worse.
If I profile the Linux version with strace -fc (with the sync part commented out):
strace -fc ./a.out
strace: Process 7087 attached
strace: Process 7088 attached
strace: Process 7089 attached
strace: Process 7090 attached
strace: Process 7091 attached
strace: Process 7092 attached
strace: Process 7093 attached
strace: Process 7094 attached
Execution (async) time: 127614 milliseconds.
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ------------------
 88.20  566.134189         223   2527624     75001 futex
  8.55   54.877000         227    241362           epoll_wait
  1.86   11.966419          49    240335           io_uring_enter
  0.94    6.015802          25    237095      3744 read
  0.16    1.058005          59     17664           openat
  0.13    0.820848          47     17291           fadvise64
  0.08    0.541784          30     17669           close
  0.03    0.216767          55      3935           mprotect
  0.02    0.122419          56      2160           brk
  0.01    0.046680          63       740           getdents64
  0.00    0.031456          42       732           fcntl
  0.00    0.028376          89       317           epoll_ctl
  0.00    0.020176          16      1257           madvise
  0.00    0.015767          42       373           newfstatat
  0.00    0.001766          40        44           mmap
  0.00    0.001036          31        33           rt_sigprocmask
  0.00    0.000691          53        13           munmap
  0.00    0.000646          80         8           clone3
  0.00    0.000375         375         1           io_uring_setup
  0.00    0.000341          37         9           rseq
  0.00    0.000226          25         9           set_robust_list
  0.00    0.000107          53         2           eventfd2
  0.00    0.000049          12         4           pread64
  0.00    0.000044          22         2           write
  0.00    0.000023          23         1           timerfd_create
  0.00    0.000014          14         1           io_uring_register
  0.00    0.000013          13         1           rt_sigaction
  0.00    0.000013          13         1           epoll_create1
  0.00    0.000012          12         1           getrandom
  0.00    0.000011           5         2         1 arch_prctl
  0.00    0.000011          11         1           set_tid_address
  0.00    0.000011          11         1           prlimit64
  0.00    0.000000           0         1         1 access
  0.00    0.000000           0         1           execve
------ ----------- ----------- --------- --------- ------------------
100.00  641.901077         194   3308690     78747 total
So basically asio spends most of its time contending on some mutex (all those futex calls). Clearly I'm doing something wrong to get such disappointing results, but I don't know how to improve the code to leverage asynchronous reading of files.
Additionally I tried changing the number of threads in the async session, for example to 1. On Windows it gives roughly the same results; on Linux it runs so slowly I couldn't even wait for it to finish (maybe it deadlocks).
Also I tried a std::counting_semaphore sem(N) for various N, acquiring the semaphore in the loop:
for (const auto& entry : std::filesystem::recursive_directory_iterator(DIR, std::filesystem::directory_options::follow_directory_symlink)) {
// acquire
}
and releasing it in
if (e.code() == asio::error::eof && n > 0) {
// release
}
This did help on Linux quite a bit for
std::counting_semaphore sem(8);
auto threads = start_threads(1);
Execution (sync) time: 141 milliseconds.
Execution (async) time: 2717 milliseconds.
but it gets worse if I increase the number of threads:
std::counting_semaphore sem(8);
auto threads = start_threads(2);
Execution (sync) time: 143 milliseconds.
Execution (async) time: 6048 milliseconds.
Overall, in every case it's more than a 10x slowdown compared to the synchronous code.
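Roughly, the semaphore wiring looks like this (a sketch; the RAII helper here is shorthand for releasing the slot on every exit path, whereas the actual code released in the eof branch):

#include <semaphore>

std::counting_semaphore<> sem(8);

asio::awaitable<void> read_file_async_throttled(std::string fileName) {
    struct releaser {
        ~releaser() { sem.release(); } // give the slot back however we exit
    } r;
    co_await read_file_async(std::move(fileName));
}

// in the spawning loop:
//   sem.acquire(); // blocks the producer until a slot frees up
//   asio::co_spawn(strand, read_file_async_throttled(filePath), asio::detached);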
I think the assumptions are the problem here. What makes you think async IO is faster here?
Async operation scheduling benefits scenarios where IO operations have varying completion times: that way, whichever operation completes first can be serviced first.
However, disk IO has its own characteristics. Disk IO on the same volume usually has very predictable latency. What's more, some hardware strongly favors sequential access. In that respect, doing asynchronous IO, or even IO from multiple threads, is likely to HURT performance. This is very typical for magnetic storage (spinning HDDs), for example.
Besides, platforms have different ways to hint at sequential reading which allows the kernel/VFS to actively pre-fetch data.
So, all in all, let's just measure:
I ran your program on my machine. I pointed it at my /home/sehe/custom/ which contains my modular boost git checkout, among others (138792 files in 17939 directories, totaling 3.8GiB).
Without any changes except the necessary defines:
target_compile_definitions(sotest PRIVATE ASIO_HAS_IO_URING)
target_compile_definitions(sotest PRIVATE ASIO_HAS_FILE)
Repeated measurements show the same performance REGARDLESS of disk IO, because of the kernel block cache.
Also disabling EPOLL slightly improves it:
target_compile_definitions(sotest PRIVATE ASIO_DISABLE_EPOLL)
Then I realized I was still running with -fsanitize=undefined,address, so disabling that:
Voila! Our first hint that async/multi-threading slows us down? Let's swap the order:
Is it multi-threading or async? Let's remove the threads:
As you can see, any significant slowdown SHOULD be down to the filesystem/media performance under different access patterns.
Using root privileges, we can forcibly drop the caches prior to the benchmarks:
void drop_caches() {
// needs root privs
#if 0
std::ofstream("/proc/sys/vm/drop_caches") << 3 << std::endl;
std::this_thread::sleep_for(std::chrono::seconds(3));
#endif
}
Re-enabling the threads (8):
Once again, async IO helps, but threads hurt. This is likely because multiple threads defeat the kernel's prefetching.
I'd suggest making a queue of files to scan and having a configurable number of threads read them, as in the sketch below. That way each "file" naturally has thread affinity, which allows the kernel to predict access patterns and benefit from block prefetch.
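A minimal sketch of that suggestion (synchronous reads posted to an Asio thread_pool; the worker count of 4 and the hash_tree name are arbitrary choices for illustration):

#include <asio.hpp>
#include <array>
#include <cstdint>
#include <filesystem>
#include <fstream>

void hash_tree(std::filesystem::path const& root) {
    asio::thread_pool pool(4); // tunable worker count
    for (auto const& entry : std::filesystem::recursive_directory_iterator(root)) {
        if (!entry.is_regular_file())
            continue;
        // Each file is read start-to-finish on one worker, so access within a
        // file stays sequential and read-ahead friendly.
        asio::post(pool, [path = entry.path()] {
            std::ifstream in(path, std::ios::binary);
            std::array<char, 16 * 1024> buf;
            uint32_t h = 0;
            while (in.read(buf.data(), buf.size()) || in.gcount() > 0)
                for (std::streamsize i = 0; i < in.gcount(); ++i)
                    h += static_cast<unsigned char>(buf[i]);
            (void)h; // result unused in this sketch
        });
    }
    pool.join(); // wait until every queued file has been hashed
}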
Ideally, you'd add platform specific calls to hint the kernel about the intent to read the entire file sequentially (posix_fadvise with POSIX_FADV_SEQUENTIAL on POSIX, FILE_FLAG_SEQUENTIAL_SCAN on Win32).
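On Linux that could look like this (a sketch; it assumes asio::stream_file exposes the underlying POSIX descriptor via native_handle(), and error handling is omitted):

#include <fcntl.h> // posix_fadvise, POSIX_FADV_SEQUENTIAL

asio::stream_file file{strand, fileName, asio::stream_file::flags::read_only};
// Tell the kernel we intend to read the whole file front to back:
::posix_fadvise(file.native_handle(), 0, 0, POSIX_FADV_SEQUENTIAL);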
I expect that file mappings work even better (using madvise instead; the same idea applies on Windows).
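A minimal POSIX sketch of that idea (a hypothetical hash_mapped helper; error handling trimmed):

#include <cstdint>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

uint32_t hash_mapped(char const* path) {
    int fd = ::open(path, O_RDONLY);
    if (fd < 0)
        return 0;
    struct stat st{};
    ::fstat(fd, &st);
    uint32_t h = 0;
    if (st.st_size > 0) { // cannot map 0-sized files
        void* p = ::mmap(nullptr, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (p != MAP_FAILED) {
            ::madvise(p, st.st_size, MADV_SEQUENTIAL); // read-ahead hint
            auto* b = static_cast<unsigned char const*>(p);
            for (off_t i = 0; i < st.st_size; ++i)
                h += b[i];
            ::munmap(p, st.st_size);
        }
    }
    ::close(fd);
    return h;
}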
I could draw up a test program to test the difference on Linux:
#include <asio.hpp>
#include <asio/stream_file.hpp>
#include <array>
#include <chrono>
#include <cstdint>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#include <vector>
static constexpr const char* DIR = "/home/sehe/custom/";
static constexpr size_t DATA_SIZE = 16 * 1024;
asio::io_context io_context;
auto strand = asio::make_strand(io_context);
template <typename Iter> void hash(Iter begin, Iter end, uint32_t& hash) {
for (auto it = begin; it != end; it++)
hash += static_cast<uint32_t>(*it);
}
std::vector<std::thread> start_threads(unsigned num_threads) {
std::vector<std::thread> threads;
for (unsigned i = 0; i < num_threads; ++i)
threads.emplace_back([]() { io_context.run(); });
return threads;
}
asio::awaitable<void> read_file_async(std::string fileName)
{
asio::stream_file file{ strand, fileName, asio::stream_file::flags::read_only };
std::array<char, DATA_SIZE> data;
uint32_t h = 0;
for (;;) {
auto [ec, n] = co_await asio::async_read(file, asio::buffer(data), asio::transfer_at_least(data.size()), asio::as_tuple(asio::use_awaitable));
if (!ec) {
hash(data.begin(), data.begin() + n, h);
}
else if (ec == asio::error::eof) {
hash(data.begin(), data.begin() + n, h);
co_return;
}
else {
std::cout << "Error: " << ec << "\n";
co_return;
}
}
}
void drop_caches() {
// needs root privs
#if 1
std::ofstream("/proc/sys/vm/drop_caches") << 3 << std::endl;
std::this_thread::sleep_for(std::chrono::seconds(3));
#endif
}
void bench_sync() {
drop_caches();
auto start = std::chrono::steady_clock::now();
for (const auto& entry : std::filesystem::recursive_directory_iterator(DIR, std::filesystem::directory_options::follow_directory_symlink)) {
if (!entry.is_regular_file())
continue;
auto filePath = std::filesystem::path(entry.path()).lexically_normal().generic_string();
asio::stream_file file{ io_context, filePath, asio::stream_file::flags::read_only };
std::array<char, DATA_SIZE> data = { 0 };
uint32_t h = 0;
size_t n = 0;
try {
n = asio::read(file, asio::buffer(data), asio::transfer_at_least(data.size()));
hash(data.begin(), data.begin() + n, h);
} catch (std::system_error const& e) {
if (e.code() == asio::error::eof && n > 0) {
hash(data.begin(), data.begin() + n, h);
};
}
}
auto end = std::chrono::steady_clock::now();
std::cout << "Execution (sync) time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " milliseconds." << std::endl;
}
void bench_async() {
drop_caches();
auto start = std::chrono::steady_clock::now();
auto workGuard = asio::make_work_guard(strand);
auto threads = start_threads(8);
for (const auto& entry : std::filesystem::recursive_directory_iterator(DIR, std::filesystem::directory_options::follow_directory_symlink)) {
if (!entry.is_regular_file())
continue;
auto filePath = std::filesystem::path(entry.path()).lexically_normal().generic_string();
asio::co_spawn(
strand,
read_file_async(filePath),
asio::detached
);
}
workGuard.reset();
for (auto& thread : threads)
thread.join();
auto end = std::chrono::steady_clock::now();
std::cout << "Execution (async) time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " milliseconds." << std::endl;
}
int main() {
bench_async();
bench_sync();
}
Still using an Asio thread_pool for the work queuing, this version handles all files within a single directory sequentially, but posts subdirectories to the pool. You will have to figure out whether this method of queueing works for you (if you have many small/empty directories, it might not):
Live On Coliru (using Boost Asio and "." on Coliru)
#include <asio.hpp>
#include <boost/iostreams/device/mapped_file.hpp>
#include <chrono>
#include <cstdint>
#include <filesystem>
#include <functional>
#include <iostream>
namespace fs = std::filesystem;
template <typename Iter> void hash(Iter begin, Iter end, uint32_t& hash) {
for (auto it = begin; it != end; it++)
hash += static_cast<uint32_t>(*it);
}
void hash_mapped_file(fs::path const& fileName) {
std::cout << "Hashing " << fileName << std::endl;
uint32_t h = 0;
boost::iostreams::mapped_file_source file(fileName);
hash(file.begin(), file.end(), h);
}
void hash_dir(fs::path const& root) {
auto start = std::chrono::steady_clock::now();
asio::thread_pool pool(8);
// note: recursive lambda using C++23 "deducing this"
auto process = [ex = pool.get_executor()](this auto const& self, fs::path const& dir) -> void {
for (auto const& entry : fs::directory_iterator(dir))
if (entry.is_directory())
post(ex, std::bind(self, entry.path()));
else if (entry.is_regular_file() && entry.file_size()) // cannot map 0-sized files
hash_mapped_file(entry.path());
};
process(root);
pool.join();
auto end = std::chrono::steady_clock::now();
std::cout << "Execution time: "
<< std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << " ms"
<< std::endl;
}
int main() { hash_dir("/home/sehe/custom/"); }
This runs circles around any of the prior solutions. I actually added the
std::cout << "Hashing " << fileName << std::endl;
line to make sure that it was actually doing all the work... Even with the added console IO it's ~300ms tops.