Tags: c++, url, boost-spirit, boost-spirit-x3, url-parsing

URL parsing using boost::spirit::x3


I'm trying to parse a URL and break it down into parts using boost::spirit::x3, as below:

#include <iostream>
#include <boost/fusion/adapted/std_tuple.hpp>
#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/home/x3.hpp>

// Result of splitting a URL into three consecutive pieces of text.
struct UrlParts {
    std::string prefix;  // text matched before the host (scheme and user info)
    std::string host;    // host text
    std::string suffix;  // text matched after the host (port and path/query/fragment)
};
// Adapt UrlParts as a Boost.Fusion sequence of (prefix, host, suffix).
// NOTE(review): parseSpirit below binds a std::tie tuple rather than the
// struct itself, so this adaptation appears to be unused - confirm.
BOOST_FUSION_ADAPT_STRUCT(UrlParts, prefix, host, suffix)

// Splits `input` into prefix / host / suffix with a Spirit X3 grammar.
//
// Prints "Parsing failed" to std::cout when the grammar (followed by
// end-of-input) does not match; `parts` is returned either way, so on failure
// its contents should not be relied upon.
//
// NOTE(review): as posted, this is reported to fail to compile with
// "Size of the passed attribute is less than expected" (the sequence's
// attribute does not line up with the 3-element tuple) - see the discussion
// that follows the listing.
UrlParts parseSpirit(std::string_view input) {
    namespace x3 = boost::spirit::x3;

    // scheme: one or more of [a-zA-Z0-9+.-] followed by "://", captured verbatim.
    static const auto scheme_    = (x3::raw[+x3::char_("a-zA-Z0-9+.-") >> "://"]);
    // userinfo: one or more non-'@' characters followed by '@', captured verbatim.
    static const auto userinfo_  = (x3::raw[+~x3::char_("@") >> "@"]);
    // prefix: optional scheme followed by optional userinfo (two separate raw[] ranges).
    static const auto prefix_    = (-scheme_ >> -userinfo_);
    // port: ':' plus an optional run of 1-5 digits; the &(...) lookahead requires
    // that '/', '?', '#' or end-of-input comes next without consuming it.
    static const auto port_      = (x3::raw[':' >> -x3::repeat(1, 5)[x3::digit] >> &(x3::char_("/?#") | x3::eoi)]);
    // host: one or more of [a-fxXA-F0-9:.], stopping at a position where port_ would match.
    static const auto host_      = (+(x3::char_("a-fxXA-F0-9:.") - port_));
    // path: a leading '/', '?' or '#' followed by everything that remains.
    static const auto path_      = (x3::char_("/?#") >> *x3::char_);    // to store path+query+fragment
    // suffix: optional port followed by optional path; both parts may be absent.
    static const auto suffix_    = (-port_ >> -path_);

    //static const auto url = x3::rule<class url, UrlParts>() = -prefix_ >> ('[' >> host_ >> ']' | host_) >> -suffix_;
    // The host may be written bare or enclosed in '[' ... ']'.
    static const auto url = -prefix_ >> ('[' >> host_ >> ']' | host_) >> -suffix_;    // prefix & suffix are optional but host is required

    // BOOST_SPIRIT_DEBUG_NODES((scheme_)(userinfo_)(host_)(port_)(path_)(url));

    // Parse the input
    auto iter = input.begin();
    auto end = input.end();
    UrlParts parts;
    // Tuple of references so the parser writes straight into the members of `parts`.
    auto attr = std::tie(parts.prefix, parts.host, parts.suffix);
    //parse(input.begin(), input.end(), x3::eps >> url >> x3::eoi, parts);
    // Trailing x3::eoi makes the parse fail unless the whole input is consumed.
    bool ret = x3::parse(iter, end, url >> x3::eoi, attr);
    if (!ret) {
        std::cout << "Parsing failed" << std::endl;
    }
    return parts;
}

// Runs parseSpirit over a few sample URLs and prints the parts found in each.
int main()
{
    for (const char* input : {"http://usr:pwd@192.168.1.1:8080/file.php?abc=1#23",
                              "http://[::ffff:192.168.1.1]:8080/file.php?abc=1#23",
                              "http://::ffff:192.168.1.1/file.php?abc=1#23",
                              "::ffff:192.168.1.1"
                             }) {
        std::cout << "Input: " << input << '\n';
        // Parse the sample being iterated over; previously a fixed literal was
        // parsed on every iteration, so the printed output never matched "Input".
        const auto parts = parseSpirit(input);
        std::cout << "Output: Prefix: " << parts.prefix << ", Host: " << parts.host << ", Suffix: " << parts.suffix << '\n';
        std::cout << "================" << '\n';
    }
    return 0;
}

But the above code fails to compile with the following error:

/usr/include/boost/spirit/home/x3/operator/detail/sequence.hpp:140:25: error: static assertion failed: Size of the passed attribute is less than expected.
  140 |             actual_size >= expected_size
      |             ~~~~~~~~~~~~^~~~~~~~~~~~~~~~
/usr/include/boost/spirit/home/x3/operator/detail/sequence.hpp:140:25: note: ‘(((int)boost::spirit::x3::detail::partition_attribute >, boost::spirit::x3::literal_string > > >, boost::spirit::x3::optional > >, boost::spirit::x3::literal_char > > > > >, boost::spirit::x3::alternative, boost::spirit::x3::plus, boost::spirit::x3::raw_directive, boost::spirit::x3::optional, boost::spirit::x3::detail::finite_count > > >, boost::spirit::x3::and_predicate, boost::spirit::x3::eoi_parser> > > > > > >, boost::spirit::x3::literal_char >, boost::spirit::x3::plus, boost::spirit::x3::raw_directive, boost::spirit::x3::optional, boost::spirit::x3::detail::finite_count > > >, boost::spirit::x3::and_predicate, boost::spirit::x3::eoi_parser> > > > > > > >, boost::spirit::x3::optional, boost::spirit::x3::optional, boost::spirit::x3::detail::finite_count > > >, boost::spirit::x3::and_predicate, boost::spirit::x3::eoi_parser> > > > >, boost::spirit::x3::optional, boost::spirit::x3::kleene > > > > >, std::tuple, std::allocator >&, std::__cxx11::basic_string, std::allocator >&, std::__cxx11::basic_string, std::allocator >&>, boost::spirit::x3::unused_type, void>::actual_size) >= ((int)boost::spirit::x3::detail::partition_attribute >, boost::spirit::x3::literal_string > > >, boost::spirit::x3::optional > >, boost::spirit::x3::literal_char > > > > >, boost::spirit::x3::alternative, boost::spirit::x3::plus, boost::spirit::x3::raw_directive, boost::spirit::x3::optional, boost::spirit::x3::detail::finite_count > > >, boost::spirit::x3::and_predicate, boost::spirit::x3::eoi_parser> > > > > > >, boost::spirit::x3::literal_char >, boost::spirit::x3::plus, boost::spirit::x3::raw_directive, boost::spirit::x3::optional, boost::spirit::x3::detail::finite_count > > >, boost::spirit::x3::and_predicate, boost::spirit::x3::eoi_parser> > > > > > > >, boost::spirit::x3::optional, boost::spirit::x3::optional, boost::spirit::x3::detail::finite_count > > >, boost::spirit::x3::and_predicate, 
boost::spirit::x3::eoi_parser> > > > >, boost::spirit::x3::optional, boost::spirit::x3::kleene > > > > >, std::tuple, std::allocator >&, std::__cxx11::basic_string, std::allocator >&, std::__cxx11::basic_string, std::allocator >&>, boost::spirit::x3::unused_type, void>::expected_size))’ evaluates to false

Any suggestions on what's wrong? Similar code with boost::spirit::qi works fine.
Additionally, I'm interested in learning whether there is a more efficient way of doing this, e.g. using string_view instead of string, since all three parts are present in the input view.
Thanks in advance!


Solution

  • It means the attributes are not detected as compatible for the parser expression. See it broken in 1.76:

    The first problem I see is trying to synthesize a single string (prefix) out of two raw[] directives. That's... not gonna work.

    The good news is that it works (again?) starting with 1.77, but in general consider being a bit more explicit about attribute compatibility, e.g. with as_type from this answer: Understanding the List Operator (%) in Boost.Spirit, or the many others at https://stackoverflow.com/search?tab=newest&q=user%3a85371%20x3%20as_type&searchOn=3 - or even more if you look for the name `as`, which I usually prefer: https://stackoverflow.com/search?q=user%3A85371+x3+as+x3%3A%3Arule

    That said, when I look at your code, you bind a manually tied tuple AND somehow still adapt the struct? That's redundant. I'd assume you want to just tie manually, so drop the adaptation.

    Next up, by all means, don't use a parser combinator library as a tokenizer. I.e., don't randomly bunch together unrelated productions (suffix really should not contain the port specification).

    Also, parse into a real port number using... an integer parser. Certainly if you're going to be pedantic strict about the number of digits allowed anyways! See here:

    // Unsigned base-10 integer parser yielding uint16_t, accepting 1 to 5 digits.
    auto portnum_   = x3::uint_parser<uint16_t, 10, 1, 5>{};
    // ':' then the port number; the &(...) lookahead requires '/', '?', '#' or
    // end-of-input to follow, without consuming it.
    auto portspec_  = ':' >> portnum_ >> &(x3::char_("/?#") | x3::eoi);
    

    Be careful with double optionality. E.g. Since suffix_ = -port_ >> -path_ literally has only optional elements, the expression -suffix_ at best has the same meaning as suffix_. However, there are lots of situations (optional repeating constructs) where you will get infinite loops of zero-length matches.

    I suppose / or # should also end the user info production.

    Not everything needs to be raw. E.g. I'd prefer

    auto userinfo_  = +~x3::char_("@/#") >> x3::char_("@");
    

    for the userinfo. In fact, if you do

    auto authority_ = ('[' >> host_ >> ']' | host_) >> -portspec_;
    

    You will correctly get the parsed host without the [] brackets, which have meaning ONLY in the URI grammar.

    You had the end-of-input validation commented out; let's re-enable it. Let's also return richer information (including a valid flag) and add a lot more test cases:

    Live On Compiler Explorer

    #include <boost/fusion/adapted/std_tuple.hpp>
    #include <boost/spirit/home/x3.hpp>
    #include <iostream>
    
    // Result of splitting a URL into three consecutive pieces of text.
    struct UrlParts {
        std::string prefix;  // text matched before the host
        std::string host;    // host text
        std::string suffix;  // text matched after the host
    };
    
    // Reduced version of the URL parser used to isolate the attribute problem:
    // only the prefix (scheme + optional userinfo) is parsed, while the host and
    // suffix parts of the grammar and of the bound tuple are commented out.
    //
    // Because the grammar is followed by x3::eoi, any input that continues after
    // the prefix does not match and "Parsing failed" is printed. `p.host` and
    // `p.suffix` are never written here and stay empty.
    UrlParts parseSpirit(std::string_view input) {
        namespace x3 = boost::spirit::x3;
    
        // scheme: one or more of [a-zA-Z0-9+.-] followed by "://", captured verbatim.
        auto scheme_   = x3::raw[+x3::char_("a-zA-Z0-9+.-") >> "://"];
        // userinfo: one or more non-'@' characters followed by '@', captured verbatim.
        auto userinfo_ = x3::raw[+~x3::char_("@") >> "@"];
        // prefix: scheme followed by optional userinfo - two raw[] ranges that are
        // expected to end up in a single string attribute.
        auto prefix_   = scheme_ >> -userinfo_;
        // The parsers below are defined but not referenced by the reduced `url`.
        auto port_     = x3::raw[':' >> -x3::repeat(1, 5)[x3::digit] >> &(x3::char_("/?#") | x3::eoi)];
        auto host_     = +(x3::char_("a-fxXA-F0-9:.") - port_);
        auto path_     = x3::char_("/?#") >> *x3::char_; // to store path+query+fragment
        auto suffix_   = -port_ >> -path_;
    
        // Full grammar, kept for reference:
        // static const auto url = -prefix_ >> ('[' >> host_ >> ']' | host_) >>
        // -suffix_;
        auto url                              //
            //= x3::rule<class url, UrlParts>() //
            = -prefix_                        //
                                              //>> ('[' >> host_ >> ']' | host_) >>
            // -suffix_ // prefix & suffix are optional but host is required
            ;
    
        // Parse the input
        auto iter = input.begin();
        auto end = input.end();
        UrlParts p;
        // Only the prefix is bound; the other two members are left out on purpose.
        auto attr = std::tie(p.prefix/*, p.host, p.suffix*/);
        // Trailing x3::eoi makes the parse fail unless the whole input is consumed.
        bool ret = x3::parse(iter, end, url >> x3::eoi, attr);
        if (!ret) {
            std::cout << "Parsing failed" << std::endl;
        }
        return p;
    }
    
    // Runs parseSpirit over a few sample URLs and prints the parts found in each.
    int main() {
        for (const char* input : {"http://usr:pwd@192.168.1.1:8080/file.php?abc=1#23",
                                  "http://[::ffff:192.168.1.1]:8080/file.php?abc=1#23",
                                  "http://::ffff:192.168.1.1/file.php?abc=1#23", "::ffff:192.168.1.1"}) {
            std::cout << "Input: " << input << '\n';
            // Parse the sample being iterated over; previously a fixed literal was
            // parsed on every iteration, so the printed output never matched "Input".
            const auto parts = parseSpirit(input);
            std::cout << "Output: Prefix: " << parts.prefix << ", Host: " << parts.host << ", Suffix: " << parts.suffix << '\n';
            std::cout << "================" << '\n';
        }
    }
    

    Printing

    192.168.1.1 {true, "", "", "192.168.1.1", "", unspecified}
    192.168.1.1/    {true, "", "", "192.168.1.1", "/", unspecified}
    192.168.1.1/file.php    {true, "", "", "192.168.1.1", "/file.php", unspecified}
    192.168.1.1/file.php?abc=1  {true, "", "", "192.168.1.1", "/file.php?abc=1", unspecified}
    192.168.1.1:8888    {true, "", "", "192.168.1.1", "", 8888}
    192.168.1.1:8888/   {true, "", "", "192.168.1.1", "/", 8888}
    192.168.1.1:8888/file.php   {true, "", "", "192.168.1.1", "/file.php", 8888}
    192.168.1.1:8888/file.php?abc=1 {true, "", "", "192.168.1.1", "/file.php?abc=1", 8888}
    ::ffffff::192.168.1.1:9999/file.php?abc=1   {true, "", "", "::ffffff::192.168.1.1", "/file.php?abc=1", 9999}
    http://192.168.1.1  {true, "http://", "", "192.168.1.1", "", unspecified}
    http://192.168.1.1/ {true, "http://", "", "192.168.1.1", "/", unspecified}
    http://192.168.1.1/file.php {true, "http://", "", "192.168.1.1", "/file.php", unspecified}
    http://192.168.1.1/file.php?abc=1   {true, "http://", "", "192.168.1.1", "/file.php?abc=1", unspecified}
    http://192.168.1.1:8888 {true, "http://", "", "192.168.1.1", "", 8888}
    http://192.168.1.1:8888/    {true, "http://", "", "192.168.1.1", "/", 8888}
    http://192.168.1.1:8888/file.php    {true, "http://", "", "192.168.1.1", "/file.php", 8888}
    http://192.168.1.1:8888/file.php?abc=1  {true, "http://", "", "192.168.1.1", "/file.php?abc=1", 8888}
    http://::ffffff::192.168.1.1:9999/file.php?abc=1    {true, "http://", "", "::ffffff::192.168.1.1", "/file.php?abc=1", 9999}
    http://sehe@192.168.1.1 {true, "http://", "sehe@", "192.168.1.1", "", unspecified}
    http://sehe@192.168.1.1/    {true, "http://", "sehe@", "192.168.1.1", "/", unspecified}
    http://sehe@192.168.1.1/file.php    {true, "http://", "sehe@", "192.168.1.1", "/file.php", unspecified}
    http://sehe@192.168.1.1:8888    {true, "http://", "sehe@", "192.168.1.1", "", 8888}
    http://sehe@192.168.1.1:8888/   {true, "http://", "sehe@", "192.168.1.1", "/", 8888}
    http://sehe@192.168.1.1:8888/file.php   {true, "http://", "sehe@", "192.168.1.1", "/file.php", 8888}
    http://usr:pwd@192.168.1.1/file.php?abc=1   {true, "http://", "usr:pwd@", "192.168.1.1", "/file.php?abc=1", unspecified}
    sehe@192.168.1.1    {true, "", "sehe@", "192.168.1.1", "", unspecified}
    sehe@192.168.1.1/   {true, "", "sehe@", "192.168.1.1", "/", unspecified}
    sehe@192.168.1.1/file.php   {true, "", "sehe@", "192.168.1.1", "/file.php", unspecified}
    sehe@192.168.1.1:8888   {true, "", "sehe@", "192.168.1.1", "", 8888}
    sehe@192.168.1.1:8888/  {true, "", "sehe@", "192.168.1.1", "/", 8888}
    sehe@192.168.1.1:8888/file.php  {true, "", "sehe@", "192.168.1.1", "/file.php", 8888}
    usr:pwd@192.168.1.1/file.php?abc=1  {true, "", "usr:pwd@", "192.168.1.1", "/file.php?abc=1", unspecified}
    

    BUT WAIT - THE PROBLEM?

    If you must use old Boost versions, we see the problem is back...

    So let's fix that by re-instating the raw[] around the user-info just as a hint. I'd rather upgrade Boost obviously:

    auto userinfo_  = x3::raw[+~x3::char_("@/#") >> x3::char_("@")];
    

    See it live on Boost 1.76 here: https://godbolt.org/z/6bc5Ycrcr

    The output is identical, md5 checksums:

    c69881691195579e3184ef6024136356  1.76
    c69881691195579e3184ef6024136356  1.84