c++ boost-spirit boost-spirit-qi

Update a parser to admit parentheses within quoted strings


I need to update a parser to support these new features, but I am not able to handle all of them at once:

(It is easier to understand these requirements by looking at the source code example.)

My current code, including checks, is as follows:

Godbolt link: https://godbolt.org/z/5d6o53n9h

#include <algorithm>
#include <array>
#include <cstddef>
#include <iostream>
#include <istream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <boost/fusion/adapted/struct/adapt_struct.hpp>
#include <boost/spirit/include/qi.hpp>

namespace script
{
    /// One parsed line of the script: the kind of line plus its textual arguments.
    struct Command
    {
        /// Kind of line that was recognised.
        enum Type
        {
            NONE,
            WRITE_LOG,
            INSERT_LABEL,
            START_PROCESS,
            END_PROCESS,
            COMMENT,
            FAIL
        };

        Type type = NONE;              // defaults to NONE until a parser assigns something else
        std::vector<std::string> args; // arguments, kept as plain strings
    };

    /// A whole script: one Command per parsed line.
    using Commands = std::vector<Command>;
}//namespace script

// Adapt Command as a Boost.Fusion sequence (members listed as: type, args) so the
// Spirit rules below can use it as their attribute.
BOOST_FUSION_ADAPT_STRUCT(script::Command, type, args)

namespace script
{
    namespace qi = boost::spirit::qi;

    // Line-oriented Qi grammar that turns a whole script into Commands.
    // Each line is matched by exactly one of the alternatives none | command | comment | fail
    // (tried in that order); lines are separated by eol and the input must end at eoi.
    template <typename It>
    class Parser : public qi::grammar<It, Commands()>
    {
    private:
        // Maps a command keyword (e.g. "WriteLog") to its Command::Type.
        qi::symbols<char, Command::Type> type;
        // Per-line rules; all of them are declared with the blank skipper.
        qi::rule<It, Command(), qi::blank_type> none, command, comment, fail;//By its very nature "fail" must be the last one to be checked
        // Entry rule; declared without a skipper and installs skip(blank) itself.
        qi::rule<It, Commands()> start;

    public:
        Parser() : Parser::base_type(start)
        {
            using namespace qi;//NOTE: "as_string" is necessary in all args due to std::vector<std::string>
            // Reusable parser that consumes nothing and yields an empty argument list.
            auto empty_args = copy(attr(std::vector<std::string>{}));

            type.add
                ("WriteLog", Command::WRITE_LOG)
                ("InsertLabel", Command::INSERT_LABEL)
                ("StartProcess", Command::START_PROCESS)
                ("EndProcess", Command::END_PROCESS);

            // Empty line: only blanks up to (but not consuming) the end of line / end of input.
            none = omit[*blank] >> &(eol | eoi)
                >> attr(Command::NONE)
                >> empty_args;//ignore args

            // keyword '(' arg {',' arg} ')'. An argument is one or more characters other than
            // '(', ')', ',', CR and LF, so none of those can appear inside an argument,
            // quoted or not, and the quotes themselves stay part of the argument text.
            command = type >> '('
                >> as_string[lexeme[+~char_("(),\r\n")]] % ',' >> ')';

            // "//" followed by the rest of the line, stored as the single argument.
            comment = lit("//")
                >> attr(Command::COMMENT)
                >> as_string[lexeme[*~char_("\r\n")]];

            // Catch-all: swallows the rest of any line and reports it as FAIL with no arguments.
            fail = omit[*~char_("\r\n")]
                >> attr(Command::FAIL)
                >> empty_args;//ignore args

            start = skip(blank)[(none | command | comment | fail) % eol] >> eoi;
        }
    };

    /// Reads the whole stream and parses it into Commands using Parser.
    /// Whitespace skipping is switched off on `in` before reading.
    /// Throws std::runtime_error("command parse error") when the grammar does not match.
    Commands parse(std::istream& in)
    {
        using Iterator = boost::spirit::istream_iterator;
        static const Parser<Iterator> grammar;

        Commands result;

        in >> std::noskipws;//No white space skipping
        Iterator cursor(in);
        Iterator end;

        bool const matched = qi::parse(cursor, end, grammar, result);
        if (!matched)
            throw std::runtime_error("command parse error");

        return result;
    }
}//namespace script

// Sample script fed to script::parse by main(): a comment, blank lines, well-formed
// commands, and two commands whose quoted string is never closed.
std::stringstream ss{
R"(// just a comment

WriteLog("this is a log")
WriteLog("this is also (in another way) a log")
WriteLog("but this is just a fail)

StartProcess(17, "program.exe", True)
StartProcess(17, "this_is_a_fail.exe, True)
)"};

int main()
{
    using namespace script;

    try
    {
        auto commands = script::parse(ss);

        std::array args{ 0, 0, 1, 1, -1, 0, 3, -1, 0 };//Fails may have any number of arguments. It doesn't care. Sets as -1 by convenience flag
        std::array types{ Command::COMMENT, Command::NONE, Command::WRITE_LOG, Command::WRITE_LOG, Command::FAIL, Command::NONE, Command::START_PROCESS, Command::FAIL, Command::NONE };
        std::cout << std::boolalpha << "size correct? " << (commands.size() == 9) << std::endl;
        std::cout << "types correct? " << std::equal(commands.begin(), commands.end(), types.begin(), types.end(), [](auto& cmd, auto& type) { return cmd.type == type; }) << std::endl;
        std::cout << "arguments correct? " << std::equal(commands.begin(), commands.end(), args.begin(), args.end(), [](auto& cmd, auto arg) { return cmd.args.size() == arg || arg == -1; }) << std::endl;
    }
    catch (std::exception const& e)
    {
        std::cout << e.what() << "\n";
    }
}

Any help with this will be appreciated.


Solution

  • You say you want to allow parentheses within quoted strings. But you don't even support quoted strings!

    So the problem is your argument rule, which doesn't even exist yet. It would be roughly this part:

    argument = +~char_("(),\r\n");
    command = type >> '(' >> argument % ',' >> ')';
    

    Where argument might be declared as

    qi::rule<It, Argument()> argument;
    

    In fact, rewriting the tests in an organized fashion, here's what we get right now:

    Live On Compiler Explorer

    static const Commands expected{
        {Command::COMMENT, {"just a comment"}},
        {Command::NONE, {}},
        {Command::WRITE_LOG, {"this is a log"}},
        {Command::WRITE_LOG, {"this is also (in another way) a log"}},
        {Command::FAIL, {}},
        {Command::NONE, {}},
        {Command::START_PROCESS, {"17", "program.exe", "True"}},
        {Command::FAIL, {}},
        {Command::NONE, {}},
    };
    
    try {
        auto parsed = script::parse(ss);
        fmt::print("Parsed all correct? {} -- {} parsed (vs. {} expected)\n",
                   (parsed == expected), parsed.size(), expected.size());
    
        for (auto i = 0u; i < std::min(expected.size(), parsed.size()); ++i) {
            if (expected[i] != parsed[i]) {
                fmt::print("index #{} expected {}\n"
                           "          actual:  {}\n",
                           i, expected[i], parsed[i]);
            } else {
                fmt::print("index #{} CORRECT ({})\n", i, parsed[i]);
            }
        }
    } catch (std::exception const& e) {
        fmt::print("Exception: {}\n", e.what());
    }
    

    Prints

    Parsed all correct? false -- 9 parsed (vs. 9 expected)
    index #0 CORRECT (Command(COMMENT, ["just a comment"]))
    index #1 CORRECT (Command(NONE, []))
    index #2 expected Command(WRITE_LOG, ["this is a log"])
              actual:  Command(WRITE_LOG, ["\"this is a log\""])
    index #3 expected Command(WRITE_LOG, ["this is also (in another way) a log"])
              actual:  Command(FAIL, [])
    index #4 expected Command(FAIL, [])
              actual:  Command(WRITE_LOG, ["\"but this is just a fail"])
    index #5 CORRECT (Command(NONE, []))
    index #6 expected Command(START_PROCESS, ["17", "program.exe", "True"])
              actual:  Command(START_PROCESS, ["17", "\"program.exe\"", "True"])
    index #7 expected Command(FAIL, [])
              actual:  Command(START_PROCESS, ["17", "\"this_is_a_fail.exe", "True"])
    index #8 CORRECT (Command(NONE, []))
    

    As you can see, it fails quoted strings too, in my expectation. That's because the quoting is a language construct. In the AST (parsed results) you do not care about how exactly it was written in code. E.g. "hello\ world\041" might be equivalent to "hello world!", so both should result in the argument value hello world!.

    So, let's do as we say:

    argument = quoted_string | number | boolean | raw_string;
    

    We can add a few rules:

    // notice these are lexemes (no internal skipping):
    qi::rule<It, Argument()> argument, quoted_string, number, boolean, raw_string;
    

    And define them:

    quoted_string = '"' >> *~char_('"') >> '"';
    number        = raw[double_];
    boolean       = raw[bool_];
    raw_string    = +~char_("(),\r\n");
    argument      = quoted_string | number | boolean | raw_string;
    

    If you want to allow escaped quotes, use something like this:

     quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
    

    Now, I'd say you probably want Argument to be something like variant<double, std::string, bool>, instead of just std::string.

    With only this change, all the problems have practically vanished: Live On Compiler Explorer:

    Parsed all correct? false -- 9 parsed (vs. 9 expected)
    index #0 CORRECT (Command(COMMENT, ["just a comment"]))
    index #1 CORRECT (Command(NONE, []))
    index #2 CORRECT (Command(WRITE_LOG, ["this is a log"]))
    index #3 CORRECT (Command(WRITE_LOG, ["this is also (in another way) a log"]))
    index #4 CORRECT (Command(FAIL, []))
    index #5 CORRECT (Command(NONE, []))
    index #6 CORRECT (Command(START_PROCESS, ["17", "program.exe", "True"]))
    index #7 expected Command(FAIL, [])
              actual:  Command(START_PROCESS, ["17", "this_is_a_fail.exe, True)\n\"this_is_a_fail.exe", "True"])
    index #8 CORRECT (Command(NONE, []))
    

    Now, index #7 looks very funky, but it's actually a well-known phenomenon in Spirit¹. Enabling BOOST_SPIRIT_DEBUG demonstrates it:

      <argument>
        <try>"this_is_a_fail.exe,</try>
        <quoted_string>
          <try>"this_is_a_fail.exe,</try>
          <fail/>
        </quoted_string>
        <number>
          <try>"this_is_a_fail.exe,</try>
          <fail/>
        </number>
        <boolean>
          <try>"this_is_a_fail.exe,</try>
          <fail/>
        </boolean>
        <raw_string>
          <try>"this_is_a_fail.exe,</try>
          <success>, True)</success>
          <attributes>[[t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e, ,,  , T, r, u, e, ), ", t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e]]</attributes>
        </raw_string>
        <success>, True)</success>
        <attributes>[[t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e, ,,  , T, r, u, e, ), ", t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e]]</attributes>
      </argument>
    

    So, the string gets accepted as a raw string, even though it started with ". That's easily fixed, but we don't even need to. We could just apply qi::hold to avoid the duplication:

    argument = qi::hold[quoted_string] | number | boolean | raw_string;
    

    Result:

    actual:  Command(START_PROCESS, ["17", "\"this_is_a_fail.exe", "True"])
    

    However, if you expect it to fail, fix that other problem:

    raw_string    = +~char_("\"(),\r\n"); // note the \"
    

    Note: In the off-chance you really only require it to not start with a quote:

    raw_string    = !lit('"') >> +~char_("(),\r\n");
    

    I guess by now you see the problem with a "loose rule" like that, so I don't recommend it.

    You could express the requirement another way though, saying "if an argument starts with '"' then it MUST be a quoted_string". Use an expectation point there:

    quoted_string = '"' > *('\\' >> char_ | ~char_('"')) > '"';
    

    This has the effect that failure to parse a complete quoted_string will throw an expectation_failed exception.

    Summary / Listing

    This is what we end up with:

    Live On Compiler Explorer

    //#define BOOST_SPIRIT_DEBUG
    #include <boost/fusion/adapted/struct/adapt_struct.hpp>
    #include <boost/spirit/include/qi.hpp>
    #include <fmt/ranges.h>
    
    namespace script {
        /// A single argument, kept as text.
        using Argument = std::string;
        /// The argument list of one command.
        using Arguments = std::vector<Argument>;

        /// One parsed line of the script: the kind of line plus its arguments.
        struct Command {
            /// Kind of line that was recognised.
            enum Type { NONE, WRITE_LOG, INSERT_LABEL, START_PROCESS, END_PROCESS, COMMENT, FAIL };

            Type      type = NONE; // defaults to NONE until a parser assigns something else
            Arguments args;

            // Member-wise ordering; also provides the == / != used to compare against
            // the expected results.
            auto operator<=>(Command const&) const = default;
        };

        /// A whole script: one Command per parsed line.
        using Commands = std::vector<Command>;
    } // namespace script
    
    // Adapt Command as a Boost.Fusion sequence (members listed as: type, args) so the
    // Spirit rules below can use it as their attribute.
    BOOST_FUSION_ADAPT_STRUCT(script::Command, type, args)
    
    namespace script {
        namespace qi = boost::spirit::qi;
    
        /// Line-oriented Qi grammar that turns a whole script into Commands.
        /// Each line is matched by exactly one of none | command | comment | fail (tried in
        /// that order); lines are separated by eol and the input must end at eoi.
        template <typename It> class Parser : public qi::grammar<It, Commands()> {
        public:
            Parser() : Parser::base_type(start) {
                using namespace qi;

                type.add //
                    ("WriteLog",     Command::WRITE_LOG)     //
                    ("InsertLabel",  Command::INSERT_LABEL)  //
                    ("StartProcess", Command::START_PROCESS) //
                    ("EndProcess",   Command::END_PROCESS);  //

                // Empty line: only blanks up to (but not consuming) end of line / end of input.
                none = omit[*blank] >> &(eol | eoi) //
                    >> attr(Command{Command::NONE, {}});

                // A quoted string may contain anything except an unescaped '"'; the
                // surrounding quotes and the escaping backslashes are not stored.
                quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
                // Numbers and booleans are validated but kept as their source text (raw[]).
                number        = raw[double_];
                boolean       = raw[bool_];
                // Anything else, as long as it contains no quote, parenthesis, comma or line break.
                raw_string    = +~char_("\"(),\r\n");
                // hold[] discards what a failed quoted_string already appended to the
                // attribute before the next alternative is tried.
                argument = qi::hold[quoted_string] | number | boolean | raw_string;

                command = type >> '(' >> argument % ',' >> ')';

                // "//" followed by the rest of the line, stored as the single argument.
                // as_string is still needed here to store the characters as one string.
                comment = "//"                             //
                    >> attr(Command::COMMENT)              //
                    >> as_string[lexeme[*~char_("\r\n")]]; //

                // Catch-all: swallows the rest of any line and reports it as FAIL.
                fail = omit[*~char_("\r\n")] >> attr(Command{Command::FAIL, {}});

                line  = none | command | comment | fail; // keep fail last
                start = skip(blank)[line % eol] >> eoi;

                BOOST_SPIRIT_DEBUG_NODES((start)(line)(fail)(comment)(command)(
                    argument)(none)(quoted_string)(raw_string)(boolean)(number))
            }

        private:
            // Maps a command keyword (e.g. "WriteLog") to its Command::Type.
            qi::symbols<char, Command::Type>         type;
            qi::rule<It, Command(), qi::blank_type>  line, none, command, comment, fail;
            // notice these are lexemes (no internal skipping):
            qi::rule<It, Argument()> argument, quoted_string, number, boolean, raw_string;
            qi::rule<It, Commands()> start;
        };
    
        /// Reads the whole stream and parses it into Commands using Parser.
        /// Whitespace skipping is switched off on `in` before reading.
        /// Throws std::runtime_error("command parse error") when the grammar does not match.
        Commands parse(std::istream& in)
        {
            using It = boost::spirit::istream_iterator;
            static const Parser<It> parser;

            Commands commands;

            It       first{in >> std::noskipws};
            It const last{};

            if (!qi::parse(first, last, parser, commands))
                throw std::runtime_error("command parse error");

            // Returning the local by name (rather than through `cond ? commands : throw ...`,
            // which yields an lvalue) lets the result be moved or elided instead of copied.
            return commands;
        }
    
        struct Formatter {
            static constexpr auto name(script::Command::Type type) {
                return std::array{"NONE",          "WRITE_LOG",   "INSERT_LABEL",
                                "START_PROCESS", "END_PROCESS", "COMMENT",
                                "FAIL"}
                    .at(static_cast<int>(type));
            }
    
            auto parse(auto& ctx) const { return ctx.begin(); }
            auto format(script::Command const& cmd, auto& ctx) const {
                return format_to(ctx.out(), "Command({}, {})", name(cmd.type), cmd.args);
            }
        };
    } // namespace script
    
    template <> struct fmt::formatter<script::Command> : script::Formatter {};
    
    // Sample script fed to script::parse by main(): a comment, blank lines, well-formed
    // commands, and two commands whose quoted string is never closed.
    std::stringstream ss{
        R"(// just a comment
    
        WriteLog("this is a log")
        WriteLog("this is also (in another way) a log")
        WriteLog("but this is just a fail)
    
        StartProcess(17, "program.exe", True)
        StartProcess(17, "this_is_a_fail.exe, True)
        )"};
    
    /// Parses the sample script in `ss`, compares the result element by element with
    /// the expected commands and prints a report. Exceptions are reported, not propagated.
    int main() {
        using namespace script;

        // Reference result: one entry per line of the sample script.
        static const Commands expected{
            {Command::COMMENT, {"just a comment"}},
            {Command::NONE, {}},
            {Command::WRITE_LOG, {"this is a log"}},
            {Command::WRITE_LOG, {"this is also (in another way) a log"}},
            {Command::FAIL, {}},
            {Command::NONE, {}},
            {Command::START_PROCESS, {"17", "program.exe", "True"}},
            {Command::FAIL, {}},
            {Command::NONE, {}},
        };

        try {
            auto const parsed = script::parse(ss);
            fmt::print("Parsed all correct? {} -- {} parsed (vs. {} expected)\n",
                       (parsed == expected), parsed.size(), expected.size());

            // Only the indices present in both sequences are compared one by one.
            auto const common = std::min(expected.size(), parsed.size());
            for (auto idx = 0u; idx < common; ++idx) {
                auto const& want = expected[idx];
                auto const& got  = parsed[idx];

                if (want == got) {
                    fmt::print("index #{} CORRECT ({})\n", idx, got);
                    continue;
                }

                fmt::print("index #{} expected {}\n"
                           "          actual:  {}\n",
                           idx, want, got);
            }
        } catch (std::exception const& e) {
            fmt::print("Exception: {}\n", e.what());
        }
    }
    

    Prints

    Parsed all correct? true -- 9 parsed (vs. 9 expected)
    index #0 CORRECT (Command(COMMENT, ["just a comment"]))
    index #1 CORRECT (Command(NONE, []))
    index #2 CORRECT (Command(WRITE_LOG, ["this is a log"]))
    index #3 CORRECT (Command(WRITE_LOG, ["this is also (in another way) a log"]))
    index #4 CORRECT (Command(FAIL, []))
    index #5 CORRECT (Command(NONE, []))
    index #6 CORRECT (Command(START_PROCESS, ["17", "program.exe", "True"]))
    index #7 CORRECT (Command(FAIL, []))
    index #8 CORRECT (Command(NONE, []))
    

    ¹ see e.g. boost::spirit alternative parsers return duplicates (which links to three more of the same kind)