c++ boost boost-asio boost-beast

Access the http response body before the entire response has been read


I need to read the response from an HTTP server like this:

To read the body I use boost::beast::http::async_read. As the response body I need to use boost::beast::http::string_body. In other words, the response parser should have the following type:

boost::beast::http::response_parser<boost::beast::http::string_body>

I need to access the response body before the full response is received. When the response contains the Transfer-Encoding: chunked header, we can assign callback functions:

  1. beast::http::response_parser::on_chunk_header
  2. beast::http::response_parser::on_chunk_body

Then these functions will be called as chunks arrive, and we can access the response body data and process it before we get the full response of the entire body.

However, if the response does not contain Transfer-Encoding: chunked, but contains a Content-Length that is very long, say 200 megabytes, then we will have to wait a very long time for the entire response. But I need to start processing the data even before I get the full response. And when I get the full response, I need to call the callback function.

How can I solve this problem?


UPDATE

Ideally, I would like to get rid of on_chunk_header and on_chunk_body and get raw body chunks in the same way, regardless of whether the response was chunked or not.


Solution

  • Just to outline what I had in mind in my comment:

    Live On Coliru

    #include <algorithm>
    #include <array>
    #include <cassert>
    #include <functional>
    #include <iostream>
    #include <numeric>
    #include <span>
    #include <string>

    #include <boost/beast.hpp>
    #include <boost/lexical_cast.hpp>
    #include <fmt/ranges.h>
    namespace net   = boost::asio;
    namespace beast = boost::beast;
    namespace http  = beast::http;
    using boost::system::error_code;
    using net::ip::tcp;
    
    // Opens a TCP connection to the httpbin.org server (addressed by a fixed
    // IP) and sends a single HTTP/1.1 GET for "/stream-bytes/2000?seed=42".
    //
    // Returns the connected socket; the response is left unread so that the
    // caller can choose how to parse it.
    tcp::socket send_get() {
        net::system_executor executor;
        tcp::socket          sock(executor);
        tcp::resolver        resolver(executor);

        // auto const endpoints = resolver.resolve("httpbin.org", "http");
        auto const endpoints = resolver.resolve("44.207.188.95", "80"); // For COLIRU, DNS is not available
        connect(sock, endpoints);

        // The Host header still names the site even though we dialed the IP.
        http::request<http::empty_body> request{http::verb::get, "/stream-bytes/2000?seed=42", 11};
        request.set(http::field::host, "httpbin.org");
        write(sock, request);

        return sock;
    }
    
    // Reference implementation: reads the complete response into a
    // string_body with one blocking read, then prints the headers, a preview
    // of the first/last body bytes and an XOR checksum over the whole body.
    //
    // Returns the fully parsed response. The read overload used here takes no
    // error_code, so failures are expected to surface as exceptions.
    http::response<http::string_body> using_string_body() {
        tcp::socket conn = send_get();

        http::response<http::string_body> res;
        beast::flat_buffer                buf;
        read(conn, buf, res);

        std::cerr << "response: " << res.base() << "\n";

        std::span body = res.body();
        size_t const n = body.size();

        // span::first/last require count <= size(); clamp the preview length
        // so a short (or empty) body does not invoke undefined behavior.
        size_t const edge = std::min<size_t>(n, 10);
        fmt::print("body, {} bytes: {::0x} ... {::0x}\n", n, body.first(edge), body.last(edge));

        // Accumulate in unsigned char, like using_buffer_body does, so the
        // printed checksum does not depend on whether plain char is signed.
        auto const checksum =
            std::reduce(body.begin(), body.end(), static_cast<unsigned char>(0), std::bit_xor<>{});
        fmt::print("{} body checksum: {:#0x}\n", __FUNCTION__, checksum);

        // return with string_body:
        return res;
    }
    
    // Streaming implementation: parses the headers first, then pulls the body
    // through a buffer_body parser in blocks of at most 512 bytes. Each block
    // is available for processing (here: checksum + append) as soon as it has
    // been parsed, regardless of whether the response is chunked or has a
    // Content-Length.
    //
    // Returns a string_body response assembled from the parsed headers and
    // the accumulated body, so the result is comparable with
    // using_string_body(). Throws boost::system::system_error on any read
    // error other than http::error::need_buffer.
    http::response<http::string_body> using_buffer_body() {
        tcp::socket conn = send_get();

        // NOTE(review): the parser's body size limit is left at its default;
        // for very large responses it presumably has to be raised via
        // p.body_limit(...) before reading - confirm against the Beast docs.
        http::response_parser<http::buffer_body> p;
        auto& res      = p.get(); // convenience shorthands
        auto& body_val = res.body();

        beast::flat_buffer buf;
        error_code ec;
        read_header(conn, buf, p, ec);
        //read(conn, buf, p, ec);

        if (ec && ec != http::error::need_buffer) // expected
            throw boost::system::system_error(ec);

        assert(p.is_header_done());
        std::cerr << "\n---\nresponse headers: " << res.base() << "\n";

        char unsigned checksum = 0;
        size_t n = 0;

        std::string full_body;
        while (!p.is_done()) {
            // Hand the parser a fresh output window for this iteration.
            std::array<char, 512> block;
            body_val.data = block.data();
            body_val.size = block.size();
            read(conn, buf, p, ec);

            // need_buffer only signals that the window filled up before the
            // message was complete; anything else is a real error.
            if (ec && ec != http::error::need_buffer) // expected
                throw boost::system::system_error(ec);

            // body_val.size now holds the unused remainder of the window.
            size_t curr = block.size() - body_val.size;
            n += curr;

            std::cerr << "[incrementally parsed " << curr << " body bytes, processing]\n";

            full_body.append(block.data(), curr);
            for (auto b : std::span(block).first(curr))
                checksum ^= b;
        }

        std::span body = full_body;

        // span::first/last require count <= size(); clamp the preview length
        // so a short (or empty) body does not invoke undefined behavior.
        size_t const edge = std::min<size_t>(body.size(), 10);
        fmt::print("body, {} bytes: {::0x} ... {::0x}\n", n, body.first(edge), body.last(edge));

        fmt::print("body, {} bytes streaming decoded, chunked? {}\n", n, p.chunked());
        fmt::print("{} body checksum: {:#0x}\n", __FUNCTION__, checksum);

        // return with string_body:
        // base() yields an lvalue reference even when called on an rvalue
        // message, so the move has to be applied to its result for the
        // headers to be moved rather than copied.
        return http::response<http::string_body>{std::move(res.base()), std::move(full_body)};
    }
    
    // Runs both variants against the same endpoint and reports whether they
    // produced identical bodies and identical (textually rendered) headers.
    int main() {
        auto const a = using_string_body();
        auto const b = using_buffer_body();

        bool const same_body = (a.body() == b.body());

        // Headers are compared through their serialized text form.
        auto const headers_a    = boost::lexical_cast<std::string>(a.base());
        auto const headers_b    = boost::lexical_cast<std::string>(b.base());
        bool const same_headers = (headers_a == headers_b);

        fmt::print("a == b: body {} headers {}\n", same_body, same_headers);
    }
    

    Note that the headers sometimes differ because they contain a timestamp. Often, though, both timestamps fall within the same second:

    enter image description here