c++c++17google-benchmark

Length error when trying to create a std::wstring to store result of std::mbsrtowcs


I am trying to measure how much time it takes to read a text file (std::ifstream) using different approaches and create a std::wstring from its contents. For creating a std::wstring, I had created two methods, one to convert character by character in a loop (std::mbrtowc), and other to convert all the content in one go (std::mbsrtowcs).

The text file that I am reading consists of 3369045 bytes and is expected to have 3324222 (wide) characters.

Converting character by character seems to work fine, but converting all in one go, leads to the following error:

terminate called after throwing an instance of 'std::length_error'
  what():  basic_string::_M_create
Aborted (core dumped)

I understand that it is thrown when creating a large std::wstring. Is it possible to fix this? If yes, how? Also, if I am doing something wrong, please let me know.

Code:

#include <iostream>
#include <fstream>
#include <filesystem>
#include <vector>
#include <cwchar>
#include <benchmark/benchmark.h>
#include <cassert>

static std::istream::pos_type get_file_size(std::istream& in)
{
    in.seekg(0, std::ios::end);
    auto pos = in.tellg();
    in.seekg(0, std::ios::beg);
    return pos;
}

std::wstring get_file_contents(const char* buffer, std::size_t size)
{
    std::mbstate_t state{};
    wchar_t current_wide_char;
    std::size_t remaining = size;
    std::size_t offset = 0;
    std::wstring output{};

    while (remaining > 0)
    {
        std::size_t len = std::mbrtowc(&current_wide_char, buffer + offset, remaining, &state);
        if (len == 0)
        {
            offset++;
            remaining--;
        }
        else if (len >= 1 && len <= remaining)
        {
            output += current_wide_char;
            offset += len;
            remaining -= len;
        }
        else if (len == static_cast<std::size_t>(-1))
        {
            std::cerr << "Failed: encoding error after offset " << offset << '\n';
            break;
        }
        else if (len == static_cast<std::size_t>(-2))
        {
            std::cerr << "Failed: incomplete sequence\n";
            break; 
        }
    }
    return output;
}

std::wstring get_file_contents2(const char* buffer, std::size_t size)
{
    std::mbstate_t state{};
    std::size_t len = std::mbsrtowcs(nullptr, &buffer, 0, &state);
    std::wstring output(len, L'\0');
    std::mbsrtowcs(output.data(), &buffer, len, &state);
    return output;
}

static void istreambuf_iterator(benchmark::State& state)
{
    std::setlocale(LC_ALL, "");
    for (auto _ : state)
    {
        std::ifstream in{"test.txt"};
        if (in.good())
        {
            std::vector<char> buffer{std::istreambuf_iterator<char>(in), std::istreambuf_iterator<char>()};
            const auto size = get_file_size(in);
            in.close();
            assert(buffer.size() == size);
            assert(size == 3369045u);

            const char* ptr = buffer.data();
            auto output = get_file_contents(ptr, size); 
            assert(output.length() == 3324222u);
            benchmark::DoNotOptimize(buffer);
            benchmark::DoNotOptimize(output);
        }
    }
}

static void istream_read_vec(benchmark::State& state)
{
    for (auto _ : state)
    {
        std::ifstream in{"test.txt"};
        if (in.good())
        {
            const auto size = get_file_size(in);
            std::vector<char> buffer(size, '\0');
            in.read(buffer.data(), size);
            in.close();
            assert(buffer.size() == size);
            assert(size == 3369045u);

            const char* ptr = buffer.data();
            auto output = get_file_contents(ptr, size);
            assert(output.length() == 3324222u);
            benchmark::DoNotOptimize(buffer);
            benchmark::DoNotOptimize(output);
        }
    }
}
static void istream_read_dynamic_arr(benchmark::State& state)
{
    for (auto _ : state)
    {
        std::ifstream in{"test.txt"};
        if (in.good())
        {
            const auto size = get_file_size(in);
            char* buffer = new char[size];
            in.read(buffer, size);
            in.close();
            assert(size == 3369045u);
            
            auto output = get_file_contents(buffer, size);
            
            assert(output.length() == 3324222u);
            delete[] buffer;
            benchmark::DoNotOptimize(buffer);
            benchmark::DoNotOptimize(output);
        }
    }
}
static void istream_read_string(benchmark::State& state)
{
    for (auto _ : state)
    {
        std::ifstream in{"test.txt"};
        if (in.good())
        {
            const auto size = get_file_size(in);
            std::string buffer(size, '\0');
            in.read(buffer.data(), size);
            in.close();
            assert(buffer.size() == size);
            assert(size == 3369045u);

            const char* ptr = buffer.c_str();
            auto output = get_file_contents(ptr, size);

            assert(output.length() == 3324222u);
            benchmark::DoNotOptimize(buffer);
            benchmark::DoNotOptimize(output);
        }
    }
}
BENCHMARK(istreambuf_iterator);
BENCHMARK(istream_read_vec);
BENCHMARK(istream_read_dynamic_arr);
BENCHMARK(istream_read_string);
BENCHMARK_MAIN();

To reproduce the error, replace get_file_contents() in above code with get_file_contents2().

Output when running with get_file_contents():

2025-03-12T12:11:19+05:30
Running ./benchmark
Run on (16 X 2304 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 256 KiB (x8)
  L3 Unified 16384 KiB (x1)
Load Average: 0.20, 0.24, 0.18
-------------------------------------------------------------------
Benchmark                         Time             CPU   Iterations
-------------------------------------------------------------------
istreambuf_iterator        67161263 ns     67160588 ns            8
istream_read_vec           59514501 ns     59514555 ns           11
istream_read_dynamic_arr   63361615 ns     63299483 ns           12
istream_read_string        62134141 ns     62129082 ns           11

Output when running with get_file_contents2():

2025-03-12T12:12:11+05:30
Running ./benchmark
Run on (16 X 2304 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 256 KiB (x8)
  L3 Unified 16384 KiB (x1)
Load Average: 0.11, 0.21, 0.17
-------------------------------------------------------------------
Benchmark                         Time             CPU   Iterations
-------------------------------------------------------------------
istreambuf_iterator        14109594 ns     14109764 ns           44
terminate called after throwing an instance of 'std::length_error'
  what():  basic_string::_M_create
Aborted (core dumped)

Compiler: GCC version 11.4.0


Solution

  • The problem is that the input buffer passed to std::mbsrtowcs wasn't null terminated which caused undefined behavior. That's why sometimes std::mbsrtowcs returned static_cast<std::size_t>(-1) which is a very large number ultimately leading to length error when creating such a large std::wstring.

    This is evident from valgrind output logs:

    ==5619== Invalid read of size 1
    ==5619==    at 0x484ED24: strlen (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
    ==5619==    by 0x4C683D1: __mbsrtowcs_l (mbsrtowcs_l.c:72)
    ==5619==    by 0x114984: get_file_contents2[abi:cxx11](char const*, unsigned long) (test_locale.cpp:57)
    ==5619==    by 0x1152C6: istream_read_vec(benchmark::State&) (test_locale.cpp:114)
    ==5619==    by 0x142BD5: benchmark::internal::(anonymous namespace)::RunInThread(benchmark::internal::BenchmarkInstance const*, long, int, benchmark::internal::ThreadManager*, benchmark::internal::PerfCountersMeasurement*, benchmark::ProfilerManager*) (in /home/kshah/test_cpp/benchmark)
    ==5619==    by 0x144B01: benchmark::internal::BenchmarkRunner::DoNIterations() (in /home/kshah/test_cpp/benchmark)
    ==5619==    by 0x145BF8: benchmark::internal::BenchmarkRunner::DoOneRepetition() (in /home/kshah/test_cpp/benchmark)
    ==5619==    by 0x121188: benchmark::internal::(anonymous namespace)::RunBenchmarks(std::vector<benchmark::internal::BenchmarkInstance, std::allocator<benchmark::internal::BenchmarkInstance> > const&, benchmark::BenchmarkReporter*, benchmark::BenchmarkReporter*) (in /home/kshah/test_cpp/benchmark)
    ==5619==    by 0x122A68: benchmark::RunSpecifiedBenchmarks(benchmark::BenchmarkReporter*, benchmark::BenchmarkReporter*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) (in /home/kshah/test_cpp/benchmark)
    ==5619==    by 0x122C1D: benchmark::RunSpecifiedBenchmarks() (in /home/kshah/test_cpp/benchmark)
    ==5619==    by 0x115C59: main (test_locale.cpp:175)
    ==5619==  Address 0x6a0a895 is 0 bytes after a block of size 3,369,045 alloc'd
    ==5619==    at 0x4849013: operator new(unsigned long) (in /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so)
    ==5619==    by 0x1175CC: __gnu_cxx::new_allocator<char>::allocate(unsigned long, void const*) (new_allocator.h:127)
    ==5619==    by 0x117014: std::allocator_traits<std::allocator<char> >::allocate(std::allocator<char>&, unsigned long) (alloc_traits.h:464)
    ==5619==    by 0x117C71: std::_Vector_base<char, std::allocator<char> >::_M_allocate(unsigned long) (stl_vector.h:346)
    ==5619==    by 0x1174A6: std::_Vector_base<char, std::allocator<char> >::_M_create_storage(unsigned long) (stl_vector.h:361)
    ==5619==    by 0x116F08: std::_Vector_base<char, std::allocator<char> >::_Vector_base(unsigned long, std::allocator<char> const&) (stl_vector.h:305)
    ==5619==    by 0x116860: std::vector<char, std::allocator<char> >::vector(unsigned long, char const&, std::allocator<char> const&) (stl_vector.h:524)
    ==5619==    by 0x1151A9: istream_read_vec(benchmark::State&) (test_locale.cpp:106)
    

    In the valgrind output, it clearly states that 1 extra byte is being read illegally on the line std::size_t len = std::mbsrtowcs(nullptr, &buffer, 0, &state);

    Solution: Add an extra byte at the end of the buffer whose value is '\0'. One example code snippet is shown below:

    const auto size = get_file_size(in);
    std::vector<char> buffer(size, '\0');
    in.read(buffer.data(), size);
    buffer.push_back('\0');     // <---- add extra null ('\0') byte at the end of buffer
    in.close();
    

    After applying the above solution, the valgrind should no longer report Invalid read error and the issue should be resolved.