I am trying to measure how much time it takes to read a text file (std::ifstream) using different approaches and create a std::wstring
from its contents. For creating a std::wstring
, I had created two methods, one to convert character by character in a loop (std::mbrtowc
), and other to convert all the content in one go (std::mbsrtowcs
).
The text file that I am reading consists of 3369045 bytes and is expected to have 3324222 (wide) characters.
Converting character by character seems to work fine, but converting all in one go, leads to the following error:
terminate called after throwing an instance of 'std::length_error'
what(): basic_string::_M_create
Aborted (core dumped)
I understand that it is thrown when creating a large std::wstring
. Is it possible to fix this? If yes, how? Also, if I am doing something wrong, please let me know.
Code:
#include <iostream>
#include <fstream>
#include <filesystem>
#include <vector>
#include <cwchar>
#include <benchmark/benchmark.h>
#include <cassert>
static std::istream::pos_type get_file_size(std::istream& in)
{
in.seekg(0, std::ios::end);
auto pos = in.tellg();
in.seekg(0, std::ios::beg);
return pos;
}
std::wstring get_file_contents(const char* buffer, std::size_t size)
{
std::mbstate_t state{};
wchar_t current_wide_char;
std::size_t remaining = size;
std::size_t offset = 0;
std::wstring output{};
while (remaining > 0)
{
std::size_t len = std::mbrtowc(¤t_wide_char, buffer + offset, remaining, &state);
if (len == 0)
{
offset++;
remaining--;
}
else if (len >= 1 && len <= remaining)
{
output += current_wide_char;
offset += len;
remaining -= len;
}
else if (len == static_cast<std::size_t>(-1))
{
std::cerr << "Failed: encoding error after offset " << offset << '\n';
break;
}
else if (len == static_cast<std::size_t>(-2))
{
std::cerr << "Failed: incomplete sequence\n";
break;
}
}
return output;
}
std::wstring get_file_contents2(const char* buffer, std::size_t size)
{
std::mbstate_t state{};
std::size_t len = std::mbsrtowcs(nullptr, &buffer, 0, &state);
std::wstring output(len, L'\0');
std::mbsrtowcs(output.data(), &buffer, len, &state);
return output;
}
static void istreambuf_iterator(benchmark::State& state)
{
std::setlocale(LC_ALL, "");
for (auto _ : state)
{
std::ifstream in{"test.txt"};
if (in.good())
{
std::vector<char> buffer{std::istreambuf_iterator<char>(in), std::istreambuf_iterator<char>()};
const auto size = get_file_size(in);
in.close();
assert(buffer.size() == size);
assert(size == 3369045u);
const char* ptr = buffer.data();
auto output = get_file_contents(ptr, size);
assert(output.length() == 3324222u);
benchmark::DoNotOptimize(buffer);
benchmark::DoNotOptimize(output);
}
}
}
static void istream_read_vec(benchmark::State& state)
{
for (auto _ : state)
{
std::ifstream in{"test.txt"};
if (in.good())
{
const auto size = get_file_size(in);
std::vector<char> buffer(size, '\0');
in.read(buffer.data(), size);
in.close();
assert(buffer.size() == size);
assert(size == 3369045u);
const char* ptr = buffer.data();
auto output = get_file_contents(ptr, size);
assert(output.length() == 3324222u);
benchmark::DoNotOptimize(buffer);
benchmark::DoNotOptimize(output);
}
}
}
static void istream_read_dynamic_arr(benchmark::State& state)
{
for (auto _ : state)
{
std::ifstream in{"test.txt"};
if (in.good())
{
const auto size = get_file_size(in);
char* buffer = new char[size];
in.read(buffer, size);
in.close();
assert(size == 3369045u);
auto output = get_file_contents(buffer, size);
assert(output.length() == 3324222u);
delete[] buffer;
benchmark::DoNotOptimize(buffer);
benchmark::DoNotOptimize(output);
}
}
}
static void istream_read_string(benchmark::State& state)
{
for (auto _ : state)
{
std::ifstream in{"test.txt"};
if (in.good())
{
const auto size = get_file_size(in);
std::string buffer(size, '\0');
in.read(buffer.data(), size);
in.close();
assert(buffer.size() == size);
assert(size == 3369045u);
const char* ptr = buffer.c_str();
auto output = get_file_contents(ptr, size);
assert(output.length() == 3324222u);
benchmark::DoNotOptimize(buffer);
benchmark::DoNotOptimize(output);
}
}
}
BENCHMARK(istreambuf_iterator);
BENCHMARK(istream_read_vec);
BENCHMARK(istream_read_dynamic_arr);
BENCHMARK(istream_read_string);
BENCHMARK_MAIN();
To reproduce the error, replace get_file_contents()
in above code with get_file_contents2()
.
Output when running with get_file_contents()
:
2025-03-12T12:11:19+05:30
Running ./benchmark
Run on (16 X 2304 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x8)
L1 Instruction 32 KiB (x8)
L2 Unified 256 KiB (x8)
L3 Unified 16384 KiB (x1)
Load Average: 0.20, 0.24, 0.18
-------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------
istreambuf_iterator 67161263 ns 67160588 ns 8
istream_read_vec 59514501 ns 59514555 ns 11
istream_read_dynamic_arr 63361615 ns 63299483 ns 12
istream_read_string 62134141 ns 62129082 ns 11
Output when running with get_file_contents2()
:
2025-03-12T12:12:11+05:30
Running ./benchmark
Run on (16 X 2304 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x8)
L1 Instruction 32 KiB (x8)
L2 Unified 256 KiB (x8)
L3 Unified 16384 KiB (x1)
Load Average: 0.11, 0.21, 0.17
-------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------
istreambuf_iterator 14109594 ns 14109764 ns 44
terminate called after throwing an instance of 'std::length_error'
what(): basic_string::_M_create
Aborted (core dumped)
Compiler: GCC version 11.4.0
The problem is that the input buffer passed to std::mbsrtowcs
wasn't null terminated which caused undefined behavior. That's why sometimes std::mbsrtowcs
returned static_cast<std::size_t>(-1)
which is a very large number ultimately leading to length error when creating such a large std::wstring
.
This is evident from valgrind output logs:
==5619== Invalid read of size 1
==5619== at 0x484ED24: strlen (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==5619== by 0x4C683D1: __mbsrtowcs_l (mbsrtowcs_l.c:72)
==5619== by 0x114984: get_file_contents2[abi:cxx11](char const*, unsigned long) (test_locale.cpp:57)
==5619== by 0x1152C6: istream_read_vec(benchmark::State&) (test_locale.cpp:114)
==5619== by 0x142BD5: benchmark::internal::(anonymous namespace)::RunInThread(benchmark::internal::BenchmarkInstance const*, long, int, benchmark::internal::ThreadManager*, benchmark::internal::PerfCountersMeasurement*, benchmark::ProfilerManager*) (in /home/kshah/test_cpp/benchmark)
==5619== by 0x144B01: benchmark::internal::BenchmarkRunner::DoNIterations() (in /home/kshah/test_cpp/benchmark)
==5619== by 0x145BF8: benchmark::internal::BenchmarkRunner::DoOneRepetition() (in /home/kshah/test_cpp/benchmark)
==5619== by 0x121188: benchmark::internal::(anonymous namespace)::RunBenchmarks(std::vector<benchmark::internal::BenchmarkInstance, std::allocator<benchmark::internal::BenchmarkInstance> > const&, benchmark::BenchmarkReporter*, benchmark::BenchmarkReporter*) (in /home/kshah/test_cpp/benchmark)
==5619== by 0x122A68: benchmark::RunSpecifiedBenchmarks(benchmark::BenchmarkReporter*, benchmark::BenchmarkReporter*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) (in /home/kshah/test_cpp/benchmark)
==5619== by 0x122C1D: benchmark::RunSpecifiedBenchmarks() (in /home/kshah/test_cpp/benchmark)
==5619== by 0x115C59: main (test_locale.cpp:175)
==5619== Address 0x6a0a895 is 0 bytes after a block of size 3,369,045 alloc'd
==5619== at 0x4849013: operator new(unsigned long) (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
==5619== by 0x1175CC: __gnu_cxx::new_allocator<char>::allocate(unsigned long, void const*) (new_allocator.h:127)
==5619== by 0x117014: std::allocator_traits<std::allocator<char> >::allocate(std::allocator<char>&, unsigned long) (alloc_traits.h:464)
==5619== by 0x117C71: std::_Vector_base<char, std::allocator<char> >::_M_allocate(unsigned long) (stl_vector.h:346)
==5619== by 0x1174A6: std::_Vector_base<char, std::allocator<char> >::_M_create_storage(unsigned long) (stl_vector.h:361)
==5619== by 0x116F08: std::_Vector_base<char, std::allocator<char> >::_Vector_base(unsigned long, std::allocator<char> const&) (stl_vector.h:305)
==5619== by 0x116860: std::vector<char, std::allocator<char> >::vector(unsigned long, char const&, std::allocator<char> const&) (stl_vector.h:524)
==5619== by 0x1151A9: istream_read_vec(benchmark::State&) (test_locale.cpp:106)
In the valgrind output, it clearly states that 1 extra byte is being read illegally on the line std::size_t len = std::mbsrtowcs(nullptr, &buffer, 0, &state);
Solution: Add an extra byte at the end of the buffer whose value is '\0'
. One example code snippet is shown below:
const auto size = get_file_size(in);
std::vector<char> buffer(size, '\0');
in.read(buffer.data(), size);
buffer.push_back('\0'); // <---- add extra null ('\0') byte at the end of buffer
in.close();
After applying the above solution, the valgrind should no longer report Invalid read error and the issue should be resolved.