c++boost-spirit

How to solve ambiguous ruling?


Disclaimer: this is an slice of an bigger parser - reduced down to just explain my problem

i got the situation that i can haven 3 types of Identifier-starting rhs value - the . and : is only an example for rules starting with an identifier and then differ someway - also the struct copies are just for this example

so it can be just id, id.id or id:id the problem is that the id rule comes first and dictates what is needed at that point - do i need to combine my rules into one to get the different outputs? until now i though that the 3 rules starting with id will be handle as some sort of rule-composit - but it don't seem so

Live on Coliru

// #define BOOST_SPIRIT_DEBUG
#include <boost/phoenix.hpp>
#include <boost/spirit/include/qi.hpp>
#include <iomanip>
#include <variant>
namespace qi = boost::spirit::qi;

namespace Ast
{
    struct Identifier : std::string
    {
    };
    struct QualifiedId1
    {
        Identifier a, b;
    };
    struct QualifiedId2
    {
        Identifier a, b;
    };

    using Variants = std::variant<Identifier, QualifiedId1, QualifiedId2>;

} // namespace Ast

BOOST_FUSION_ADAPT_STRUCT( Ast::QualifiedId1, a, b )
BOOST_FUSION_ADAPT_STRUCT( Ast::QualifiedId2, a, b )

using It = std::string_view::const_iterator;

template <typename Attr, typename Skipper>
void run_tests( qi::rule<It, Attr(), Skipper> const& rule_, std::vector<std::string> const& tests )
{
    std::cout << "======[ " << boost::core::demangle( typeid( Attr ).name() ) << " ]======" << std::endl;
    for( std::string_view test : tests )
    {
        It f = test.begin(), l = test.end();
        try
        {
            std::cout << "Parsing: " << quoted( test );
            Attr v;
            bool ok;

            if constexpr( std::is_same_v<Skipper, qi::unused_type> )
                ok = qi::parse( f, l, qi::eps > rule_ > qi::eoi, v );
            else
                ok = qi::phrase_parse( f, l, qi::eps > rule_ > qi::eoi, Skipper{}, v );

            std::cout << ( ok ? "OK" : "FAIL" ) << " Remaining: " << quoted( std::string( f, l ) ) << "\n";
        }
        catch( qi::expectation_failure<It> const& ef )
        {
            auto p = ef.first - test.begin();
            auto bol = test.find_last_of( "\r\n", p ) + 1;
            auto line = std::count( f, f + bol, '\n' ) + 1;
            auto eol = test.find_first_of( "\r\n", p );

            std::cout << " -> EXPECTED " << ef.what_ << " in line:" << line << " col:" << ( p - bol ) << "\n"
                      << "    " << test.substr( bol, eol - bol ) << "\n"
                      << "    " << std::setw( p - bol ) << "" << "^--- here\n";
        }
        std::cout << "--------------------\n";
    }
}

int main()
{
    qi::rule<It, Ast::Identifier()> id = qi::char_( "a-zA-Z_" ) >> *qi::char_( "a-zA-Z0-9_" );
    qi::rule<It, Ast::QualifiedId1(), qi::space_type> qualId1 = id > "." > id;
    qi::rule<It, Ast::QualifiedId2(), qi::space_type> qualId2 = id > ":" > id;
    qi::rule<It, Ast::Variants(), qi::space_type> variants = ( id ) | ( qualId1 ) | ( qualId1 );

    run_tests( variants, { "abc", "abc.def", "abc:def" } );
}

Output - the id rule gets handled first and then the expectations fail

======[ class std::variant<struct Ast::Identifier,struct Ast::QualifiedId1,struct Ast::QualifiedId2> ]======
Parsing: "abc"OK Remaining: ""
--------------------
Parsing: "abc.def" -> EXPECTED <eoi> in line:1 col:3
    abc.def
       ^--- here
--------------------
Parsing: "abc:def" -> EXPECTED <eoi> in line:1 col:3
    abc:def
       ^--- here
--------------------

i tried to combine the 3 rules in something like

qi::rule<It, Ast::Variants(), qi::space_type> variants2 = id > *( ( "." > id ) | ( ":" > id ) );

that parses but only give the id type for all 3 in the variant the following id is only a example normally there are different rules behind (like only numbers or a fixed string etc.) - so not only . and : as difference


Solution

  • PEG grammars are left-to-right greedy. (a | ab) will never parse ab.

    Also, expectation points fail the surrounding rule, not just the containing expression. If backtracking is required, expectation points should not be used.

    Fixing these things:

    Live On Coliru

    // #define BOOST_SPIRIT_DEBUG
    #include <boost/phoenix.hpp>
    #include <boost/spirit/include/qi.hpp>
    #include <iomanip>
    #include <variant>
    namespace qi = boost::spirit::qi;
    
    namespace Ast {
        struct Identifier : std::string {
            using std::string::string;
            using std::string::operator=;
        };
        struct QualifiedId1 { Identifier a, b; };
        struct QualifiedId2 { Identifier a, b; };
    
        using Variants = std::variant<Identifier, QualifiedId1, QualifiedId2>;
    } // namespace Ast
    
    BOOST_FUSION_ADAPT_STRUCT(Ast::QualifiedId1, a, b)
    BOOST_FUSION_ADAPT_STRUCT(Ast::QualifiedId2, a, b)
    
    using It = std::string_view::const_iterator;
    
    template <typename Attr, typename Skipper>
    void run_tests(qi::rule<It, Attr(), Skipper> const& rule_, std::vector<std::string> const& tests) {
        using boost::core::demangle;
        std::cout << "======[ " << demangle(typeid(Attr).name()) << " ]======" << std::endl;
        for (std::string_view test : tests) {
            It f = test.begin(), l = test.end();
            try {
                std::cout << "Parsing: " << quoted(test) << " -> ";
                Attr v;
                bool ok;
    
                if constexpr (std::is_same_v<Skipper, qi::unused_type>)
                    ok = parse(f, l, qi::eps > rule_ > qi::eoi, v);
                else
                    ok = phrase_parse(f, l, qi::eps > rule_ > qi::eoi, Skipper{}, v);
    
                std::cout << (ok ? "OK" : "FAIL")                        //
                          << " Remaining: " << quoted(std::string(f, l)) //
                          << " Variant #" << v.index() << "\n";
            } catch (qi::expectation_failure<It> const& ef) {
                auto p = ef.first - test.begin();
                auto bol = test.find_last_of( "\r\n", p ) + 1;
                auto line = std::count( f, f + bol, '\n' ) + 1;
                auto eol = test.find_first_of( "\r\n", p );
    
                std::cout << "EXPECTED " << ef.what_ << " in line:" << line << " col:" << ( p - bol ) << "\n"
                          << "    " << test.substr( bol, eol - bol ) << "\n"
                          << "    " << std::setw( p - bol ) << "" << "^--- here\n";
            }
        }
    }
    
    template <typename Attr> using Rule = qi::rule<It, Attr(), qi::space_type>;
    
    int main() {
        qi::rule<It, Ast::Identifier()> id = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z0-9_");
        Rule<Ast::QualifiedId1>         qualId1  = id >> ("." > id);
        Rule<Ast::QualifiedId2>         qualId2  = id >> (":" > id);
        Rule<Ast::Variants>             variants = qualId1 | qualId2 | id;
    
        run_tests( variants, { "abc", "abc.def", "abc:def" } );
    }
    

    Prints

    ======[ std::variant<Ast::Identifier, Ast::QualifiedId1, Ast::QualifiedId2> ]======
    Parsing: "abc" -> OK Remaining: "" Variant #0
    Parsing: "abc.def" -> OK Remaining: "" Variant #1
    Parsing: "abc:def" -> OK Remaining: "" Variant #2
    

    Other Approaches

    Sometimes you can mix lookahead assertions to allow for more stringent expectations:

    keywordXYZ >> 
       (!qualId2 >> id) // will not parse qualid2
    

    In general, if b matches a subset of a the tricky case (a | bc) can be made to work with (a - b | bc), or equivalently (!b >> a | bc)

    BONUS Recursive ASTs

    In practice this kind of grammar often appears with expression parsing (where this expression denotes namespace/object member access). Then usually the syntax is recursive:

    using IdExpr = boost::variant<         //
        Id,                                //        boost::recursive_wrapper<QualId1>, //
        boost::recursive_wrapper<QualId2>>;
    

    In that case you can often avoid backtracking and incrementally build the attribute using semantic actions:

    Live On Coliru

    // #define BOOST_SPIRIT_DEBUG
    #include <boost/phoenix.hpp>
    #include <boost/spirit/include/qi.hpp>
    #include <iomanip>
    namespace qi = boost::spirit::qi;
    
    namespace Ast {
        struct Id : std::string {
            using std::string::string;
            using std::string::operator=;
        };
        struct QualId1;
        struct QualId2;
    
        using IdExpr = boost::variant<              //
            Id,                                     //
            boost::recursive_wrapper<QualId1>, //
            boost::recursive_wrapper<QualId2>>;
    
        struct QualId1 { IdExpr a, b; };
        struct QualId2 { IdExpr a, b; };
        
        // for demo output
        std::ostream& operator<<(std::ostream& os, Id const& id) { return os << "Id(" << quoted(id) << ")"; }
        std::ostream& operator<<(std::ostream& os, QualId1 const& id) { return os << "QualId1(" << id.a << "." << id.b << ")"; }
        std::ostream& operator<<(std::ostream& os, QualId2 const& id) { return os << "QualId2(" << id.a << ":" << id.b << ")"; }
    } // namespace Ast
    
    using It = std::string_view::const_iterator;
    
    template <typename Attr, typename Skipper>
    void run_tests(qi::rule<It, Attr(), Skipper> const& rule_, std::vector<std::string> const& tests) {
        using boost::core::demangle;
        std::cout << "======[ " << demangle(typeid(Attr).name()) << " ]======" << std::endl;
        for (std::string_view test : tests) {
            It f = test.begin(), l = test.end();
            try {
                std::cout << "Parsing: " << quoted(test) << " -> ";
                Attr v;
                bool ok;
    
                if constexpr (std::is_same_v<Skipper, qi::unused_type>)
                    ok = parse(f, l, qi::eps > rule_ > qi::eoi, v);
                else
                    ok = phrase_parse(f, l, qi::eps > rule_ > qi::eoi, Skipper{}, v);
    
                std::cout << (ok ? "OK" : "FAIL") //
                          << " Variant #" << v.which() << "\n -> " << v << "\n";
            } catch (qi::expectation_failure<It> const& ef) {
                auto p = ef.first - test.begin();
                auto bol = test.find_last_of( "\r\n", p ) + 1;
                auto line = std::count( f, f + bol, '\n' ) + 1;
                auto eol = test.find_first_of( "\r\n", p );
    
                std::cout << "EXPECTED " << ef.what_ << " in line:" << line << " col:" << ( p - bol ) << "\n"
                          << "    " << test.substr( bol, eol - bol ) << "\n"
                          << "    " << std::setw( p - bol ) << "" << "^--- here\n";
            }
        }
    }
    
    template <typename Attr> using Rule = qi::rule<It, Attr(), qi::space_type>;
    
    int main() {
        using namespace qi::labels;
        using boost::phoenix::construct;
        qi::rule<It, Ast::Id()> id = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z0-9_");
        qi::rule<It, Ast::IdExpr(), qi::space_type> expr;
        expr = id[_val = _1] //
            >> *(('.' >> expr)[_val = construct<Ast::QualId1>(_val, _1)] |
                 (':' >> expr)[_val = construct<Ast::QualId2>(_val, _1)]);
    
        run_tests(expr,
                  {
                      "abc",
                      "abc.def",
                      "abc:def",
                      "abc.def.ghi",
                      "abc:def:ghi",
                      "abc:def.ghi:jkl",
                      "abc:def:ghi:jkl",
                  });
    }
    

    Printing

    ======[ boost::variant<Ast::Id, boost::recursive_wrapper<Ast::QualId1>, boost::recursive_wrapper<Ast::QualId2> > ]======
    Parsing: "abc" -> OK Variant #0
     -> Id("abc")
    Parsing: "abc.def" -> OK Variant #1
     -> QualId1(Id("abc").Id("def"))
    Parsing: "abc:def" -> OK Variant #2
     -> QualId2(Id("abc"):Id("def"))
    Parsing: "abc.def.ghi" -> OK Variant #1
     -> QualId1(Id("abc").QualId1(Id("def").Id("ghi")))
    Parsing: "abc:def:ghi" -> OK Variant #2
     -> QualId2(Id("abc"):QualId2(Id("def"):Id("ghi")))
    Parsing: "abc:def.ghi:jkl" -> OK Variant #2
     -> QualId2(Id("abc"):QualId1(Id("def").QualId2(Id("ghi"):Id("jkl"))))
    Parsing: "abc:def:ghi:jkl" -> OK Variant #2
     -> QualId2(Id("abc"):QualId2(Id("def"):QualId2(Id("ghi"):Id("jkl"))))