Tags: c++, grammar, boost-spirit, boost-spirit-qi, boost-spirit-lex

Boost.Spirit SQL grammar/lexer failure


I have two problems with the following SQL grammar:

#define BOOST_SPIRIT_QI_DEBUG

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/include/karma.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/fusion/include/std_pair.hpp> 

#include <boost/algorithm/string.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/make_shared.hpp>
#include <boost/lexical_cast.hpp>

#include <iostream>
#include <fstream>
#include <string>
#include <set>
#include <utility>

namespace bs = boost::spirit;
namespace lex = boost::spirit::lex;
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;

// Token definition base, defines all tokens for the base grammar below
template <typename Lexer>
struct sql_tokens : lex::lexer<Lexer>
{
public:
    // Tokens with no attributes.
    lex::token_def<lex::omit> type_smallint, type_int, type_varchar, type_text, type_date;
    lex::token_def<lex::omit> kw_not_null, kw_auto_increment, kw_unique, kw_default, kw_create,
        kw_table, kw_constraint, kw_primary_key;

    // Attributed tokens. (If you add a new type, don't forget to add it to the lex::lexertl::token definition too).
    lex::token_def<int> signed_digit;
    lex::token_def<std::size_t> unsigned_digit;
    lex::token_def<std::string> identifier;
    lex::token_def<std::string> quoted_string;

    sql_tokens()
    {
        // Column data types.
        type_smallint = "(?i:smallint)";
        type_int = "(?i:int)";
        type_varchar = "(?i:varchar)";
        type_text = "(?i:text)";
        type_date = "(?i:date)";

        // Keywords.
        kw_not_null = "(?i:not +null)";
        kw_auto_increment = "(?i:auto_increment)";
        kw_unique = "(?i:unique)";
        kw_default = "(?i:default)";
        kw_create = "(?i:create)";
        kw_table = "(?i:table)";
        kw_constraint = "(?i:constraint)";
        kw_primary_key = "(?i:primary +key)";

        // Values.
        signed_digit = "[+-]?[0-9]+";
        unsigned_digit = "[0-9]+";
        quoted_string = "\\\"(\\\\.|[^\\\"])*\\\""; // \"(\\.|[^\"])*\"

        // Identifier.
        identifier = "[a-zA-Z][a-zA-Z0-9_]*";

        // The tokens must be added in priority order.
        this->self += lex::token_def<>('(') | ')' | ',' | ';';
        this->self += type_smallint | type_int | type_varchar | type_text |
                                    type_date;
        this->self += kw_not_null | kw_auto_increment | kw_unique | kw_default |
                                    kw_create | kw_table | kw_constraint | kw_primary_key;
        this->self += identifier | unsigned_digit | signed_digit | quoted_string;

        // Define the whitespace to ignore.
        this->self("WS")
                =       lex::token_def<>("[ \\t\\n]+") 
                |       "--[^\\n]*\\n"  // Single line comments with --
                |       "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/" // C-style comments
                ;
    }
};

// Grammar definition, define a little part of the SQL language.
template <typename Iterator, typename Lexer>
struct sql_grammar 
    : qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
{
    template <typename TokenDef>
    sql_grammar(TokenDef const& tok)
        : sql_grammar::base_type(program, "program")
    {
        program 
            =  (statement % ';') >> *qi::lit(';')
            ;

        statement 
            =   create_statement.alias()
            ;

        create_statement
            =   tok.kw_create >> create_table
            ;

        create_table
            =   tok.kw_table >> tok.identifier >> '(' >> create_table_columns >> -(',' >> table_constraints) >> ')'
            ;

        table_constraints
            =   constraint_definition % ','
            ;

        constraint_definition
            = tok.kw_constraint >> tok.identifier >> primary_key_constraint
            ;

        primary_key_constraint
            = tok.kw_primary_key >> '(' >> (tok.identifier % ',') >> ')'
            ;

        create_table_columns
            =   column_definition % ','
            ;

        column_definition
            =   tok.identifier >> column_type >> *type_constraint
            ;

        type_constraint
            =   tok.kw_not_null
            |   tok.kw_auto_increment
            |   tok.kw_unique
            |   default_value
            ;

        default_value
            =   tok.kw_default > tok.quoted_string
            ;

        column_type
            =   tok.type_smallint
            |   tok.type_int
            |   (tok.type_varchar > '(' > tok.unsigned_digit > ')') 
            |   tok.type_text
            |   tok.type_date
            ;

        program.name("program");
        statement.name("statement");
        create_statement.name("create statement");
        create_table.name("create table");
        create_table_columns.name("create table columns");
        column_definition.name("column definition");
        column_type.name("column type");
        default_value.name("default value");
        type_constraint.name("type constraint");
        table_constraints.name("table constraints");
        constraint_definition.name("constraint definition");
        primary_key_constraint.name("primary key constraint");

        BOOST_SPIRIT_DEBUG_NODE(program);
        BOOST_SPIRIT_DEBUG_NODE(statement);
        BOOST_SPIRIT_DEBUG_NODE(create_statement);
        BOOST_SPIRIT_DEBUG_NODE(create_table);
        BOOST_SPIRIT_DEBUG_NODE(create_table_columns);
        BOOST_SPIRIT_DEBUG_NODE(column_definition);
        BOOST_SPIRIT_DEBUG_NODE(column_type);
        BOOST_SPIRIT_DEBUG_NODE(default_value);
        BOOST_SPIRIT_DEBUG_NODE(type_constraint);
        BOOST_SPIRIT_DEBUG_NODE(table_constraints);
        BOOST_SPIRIT_DEBUG_NODE(constraint_definition);
        BOOST_SPIRIT_DEBUG_NODE(primary_key_constraint);

        using namespace qi::labels;
        qi::on_error<qi::fail>
        (
            program,
            std::cout
                << phx::val("Error! Expecting ")
                << bs::_4                               // what failed?
                << phx::val(" here: \"")
                << phx::construct<std::string>(bs::_3, bs::_2)   // iterators to error-pos, end
                << phx::val("\"")
                << std::endl
        );
    }

private:
    typedef qi::in_state_skipper<Lexer> skipper_type;
    typedef qi::rule<Iterator, skipper_type> simple_rule;

    simple_rule program, statement, create_statement, create_table, table_constraints, constraint_definition;
    simple_rule primary_key_constraint, create_table_columns, column_definition, type_constraint, default_value, column_type;
};

std::string file2string(const std::string& filename)
{
    std::ifstream s(filename.c_str(), std::ios_base::binary);
    std::stringstream ss;
    ss << s.rdbuf();
    return ss.str();
}

int main(int argc, char* argv[])
{
    if(argc != 2)
    {
        std::cerr << "usage: " << argv[0] << " schema_filename\n";
        return 1;
    }

    // iterator type used to expose the underlying input stream
    typedef std::string::iterator base_iterator_type;

    // This is the lexer token type to use.
    typedef lex::lexertl::token<
        base_iterator_type, boost::mpl::vector<int, std::size_t, std::string> 
    > token_type;

    // Here we use the lexertl based lexer engine.
    typedef lex::lexertl::lexer<token_type> lexer_type;

    // This is the token definition type (derived from the given lexer type).
    typedef sql_tokens<lexer_type> sql_tokens;

    // this is the iterator type exposed by the lexer 
    typedef sql_tokens::iterator_type iterator_type;

    // this is the type of the grammar to parse
    typedef sql_grammar<iterator_type, sql_tokens::lexer_def> sql_grammar;

    // now we use the types defined above to create the lexer and grammar
    // object instances needed to invoke the parsing process
    sql_tokens tokens;                         // Our lexer
    sql_grammar sql(tokens);                  // Our parser

    std::string str(file2string(argv[1]));

    // At this point we generate the iterator pair used to expose the
    // tokenized input stream.
    base_iterator_type it = str.begin();
    iterator_type iter = tokens.begin(it, str.end());
    iterator_type end = tokens.end();

    // Parsing is done based on the token stream, not the character
    // stream read from the input.
    // Note how we use the lexer defined above as the skip parser. It must
    // be explicitly wrapped inside a state directive, switching the lexer 
    // state for the duration of skipping whitespace.
    std::string ws("WS");
    bool r = qi::phrase_parse(iter, end, sql, qi::in_state(ws)[tokens.self]);

    if (r && iter == end)
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing succeeded\n";
        std::cout << "-------------------------\n";
    }
    else
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing failed\n";
        std::cout << "-------------------------\n";
    }
    return 0;
}

Problem 1: Starting with comments

When the file starts with a comment, parsing immediately fails:

/* bouh */

CREATE TABLE mytable (
  id int NOT NULL AUTO_INCREMENT
);

With the following failure tree:

<program>
  <try>[/]</try>
  <statement>
    <try>[/]</try>
    <create_statement>
      <try>[/]</try>
      <fail/>
    </create_statement>
    <fail/>
  </statement>
  <fail/>
</program>

But if I add a line break just before it, it works. Both types of comments ("--" and "/* */") fail.

Problem 2: Keyword unique not recognized

Parsing fails under one very specific condition involving the keyword unique: when UNIQUE is upper case and directly followed by a comma.

All the following cases succeed:

-- Success
CREATE TABLE Addon (
  id int NOT NULL AUTO_INCREMENT,
  u smallint NOT NULL UNIQUE
);

-- Success
CREATE TABLE Addon (
  id int NOT NULL AUTO_INCREMENT,
  u smallint NOT NULL unique,
  s int NOT NULL UNIQUE
);

-- Success
CREATE TABLE Addon (
  id int NOT NULL AUTO_INCREMENT,
  u smallint NOT NULL UNIQUE ,
  s int NOT NULL UNIQUE
);

-- Success
CREATE TABLE Addon (
  id int NOT NULL AUTO_INCREMENT,
  u smallint UNIQUE NOT NULL,
  s int NOT NULL UNIQUE
);

But this one doesn't:

-- Fail
CREATE TABLE Addon (
  id int NOT NULL AUTO_INCREMENT,
  u smallint NOT NULL UNIQUE,
  s int NOT NULL
);

Do you have any idea what is wrong? Thanks!


Solution

  • Regarding the whitespace skipping, I can only conclude that pre-skipping is not being done initially (perhaps the state is not being switched correctly).

    Of course, you could try to remedy this using the lex::tokenize_and_parse API (passing the initial state as "WS"). Scratch that: I misremembered the API; you can only do this with manual tokenization, which precludes the state switching by Qi in the first place.
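
    One way to see this for yourself is to tokenize by hand and dump the token stream before Qi ever gets involved. Here is a minimal sketch (the dump_tokens helper is mine, purely for illustration, not part of the original code); with the "WS"-state setup I would expect it to stop right at the leading comment, because the comment patterns only exist in the "WS" state:

    // Hypothetical debugging helper: walk the lexer's token iterator and print
    // each token id until an invalid token shows up (token_is_valid is found
    // via ADL on the lexertl token type). Usage: dump_tokens(tokens, str);
    template <typename Tokens>
    void dump_tokens(Tokens const& tokens, std::string& input)
    {
        std::string::iterator first = input.begin();
        typename Tokens::iterator_type it  = tokens.begin(first, input.end());
        typename Tokens::iterator_type end = tokens.end();

        for (/**/; it != end && token_is_valid(*it); ++it)
            std::cout << "token id: " << it->id() << "\n";
    }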

    However, what I tend to do is make skipping the responsibility of the lexer:

    ws = "[ \\t\\n]+";
    comment = "--[^\\n]*\\n";  // Single line comments with --
    cstyle_comment = "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"; // C-style comments
    
    this->self += ws              [ lex::_pass = lex::pass_flags::pass_ignore ] 
                | comment         [ lex::_pass = lex::pass_flags::pass_ignore ]
                | cstyle_comment  [ lex::_pass = lex::pass_flags::pass_ignore ]
                ;
    

    Now there is no need to use a skipper at all, and this successfully parses the input from the first problem (the one starting with a comment).
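
    Concretely, three things change: the grammar drops its skipper template argument (plain qi::grammar<Iterator>), the lexer type becomes lex::lexertl::actor_lexer<token_type> so that the semantic actions attached to the tokens actually run, and the top-level call shrinks from phrase_parse with an in_state skipper to a plain parse:

    // Excerpted from the full listing below (the non-STATE_WS branch):
    typedef lex::lexertl::actor_lexer<token_type> lexer_type; // runs the token actions
    // ...
    bool r = qi::parse(iter, end, sql);                       // no skipper argument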

    Full code: Live On Coliru

    Look for #ifdef STATE_WS

    //#define BOOST_SPIRIT_QI_DEBUG
    //#define STATE_WS
    
    #include <boost/spirit/include/qi.hpp>
    #include <boost/spirit/include/lex_lexertl.hpp>
    #include <boost/spirit/include/phoenix.hpp>
    #include <boost/spirit/include/karma.hpp>
    #include <boost/fusion/include/adapt_struct.hpp>
    #include <boost/fusion/include/std_pair.hpp> 
    
    #include <boost/algorithm/string.hpp>
    #include <boost/shared_ptr.hpp>
    #include <boost/make_shared.hpp>
    #include <boost/lexical_cast.hpp>
    
    #include <iostream>
    #include <fstream>
    #include <string>
    #include <set>
    #include <utility>
    
    namespace bs  = boost::spirit;
    namespace lex = boost::spirit::lex;
    namespace qi  = boost::spirit::qi;
    namespace phx = boost::phoenix;
    
    // Token definition base, defines all tokens for the base grammar below
    template <typename Lexer>
    struct sql_tokens : lex::lexer<Lexer>
    {
    public:
        // Tokens with no attributes.
        lex::token_def<lex::omit> type_smallint;
        lex::token_def<lex::omit> type_int;
        lex::token_def<lex::omit> type_varchar;
        lex::token_def<lex::omit> type_text;
        lex::token_def<lex::omit> type_date;
        lex::token_def<lex::omit> kw_not_null;
        lex::token_def<lex::omit> kw_auto_increment;
        lex::token_def<lex::omit> kw_unique;
        lex::token_def<lex::omit> kw_default;
        lex::token_def<lex::omit> kw_create;
        lex::token_def<lex::omit> kw_table;
        lex::token_def<lex::omit> kw_constraint;
        lex::token_def<lex::omit> kw_primary_key;
    
        // Attributed tokens. (If you add a new type, don't forget to add it to the lex::lexertl::token definition too).
        lex::token_def<int>         signed_digit;
        lex::token_def<std::size_t> unsigned_digit;
        lex::token_def<std::string> identifier;
        lex::token_def<std::string> quoted_string;
    
        lex::token_def<lex::omit>   ws, comment, cstyle_comment;
    
        sql_tokens()
        {
            // Column data types.
            type_smallint     = "(?i:smallint)";
            type_int          = "(?i:int)";
            type_varchar      = "(?i:varchar)";
            type_text         = "(?i:text)";
            type_date         = "(?i:date)";
    
            // Keywords.
            kw_not_null       = "(?i:not +null)";
            kw_auto_increment = "(?i:auto_increment)";
            kw_unique         = "(?i:unique)";
            kw_default        = "(?i:default)";
            kw_create         = "(?i:create)";
            kw_table          = "(?i:table)";
            kw_constraint     = "(?i:constraint)";
            kw_primary_key    = "(?i:primary +key)";
    
            // Values.
            signed_digit      = "[+-]?[0-9]+";
            unsigned_digit    = "[0-9]+";
            quoted_string     = "\\\"(\\\\.|[^\\\"])*\\\""; // \"(\\.|[^\"])*\"
    
            // Identifier.
            identifier        = "[a-zA-Z][a-zA-Z0-9_]*";
    
            // The tokens must be added in priority order.
            this->self += lex::token_def<>('(') | ')' | ',' | ';';
            this->self += type_smallint | type_int | type_varchar | type_text |
                                        type_date;
            this->self += kw_not_null | kw_auto_increment | kw_unique | kw_default |
                                        kw_create | kw_table | kw_constraint | kw_primary_key;
            this->self += identifier | unsigned_digit | signed_digit | quoted_string;
    
    #ifdef STATE_WS
            // Define the whitespace to ignore.
            this->self("WS")
                    =       ws
                    |       comment
                    |       cstyle_comment
                    ;
    #else
            ws = "[ \\t\\n]+";
            comment = "--[^\\n]*\\n";  // Single line comments with --
            cstyle_comment = "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"; // C-style comments
    
            this->self += ws              [ lex::_pass = lex::pass_flags::pass_ignore ] 
                        | comment         [ lex::_pass = lex::pass_flags::pass_ignore ]
                        | cstyle_comment  [ lex::_pass = lex::pass_flags::pass_ignore ]
                        ;
    #endif
        }
    };
    
    // Grammar definition, define a little part of the SQL language.
    template <typename Iterator, typename Lexer>
    struct sql_grammar 
    #ifdef STATE_WS
        : qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
    #else
        : qi::grammar<Iterator>
    #endif
    {
        template <typename TokenDef>
        sql_grammar(TokenDef const& tok)
            : sql_grammar::base_type(program, "program")
        {
            program 
                =  (statement % ';') >> *qi::lit(';')
                ;
    
            statement 
                =   create_statement.alias()
                ;
    
            create_statement
                =   tok.kw_create >> create_table
                ;
    
            create_table
                =   tok.kw_table >> tok.identifier >> '(' >> create_table_columns >> -(',' >> table_constraints) >> ')'
                ;
    
            table_constraints
                =   constraint_definition % ','
                ;
    
            constraint_definition
                = tok.kw_constraint >> tok.identifier >> primary_key_constraint
                ;
    
            primary_key_constraint
                = tok.kw_primary_key >> '(' >> (tok.identifier % ',') >> ')'
                ;
    
            create_table_columns
                =   column_definition % ','
                ;
    
            column_definition
                =   tok.identifier >> column_type >> *type_constraint
                ;
    
            type_constraint
                =   tok.kw_not_null
                |   tok.kw_auto_increment
                |   tok.kw_unique
                |   default_value
                ;
    
            default_value
                =   tok.kw_default > tok.quoted_string
                ;
    
            column_type
                =   tok.type_smallint
                |   tok.type_int
                |   (tok.type_varchar > '(' > tok.unsigned_digit > ')') 
                |   tok.type_text
                |   tok.type_date
                ;
    
            program.name("program");
            statement.name("statement");
            create_statement.name("create statement");
            create_table.name("create table");
            create_table_columns.name("create table columns");
            column_definition.name("column definition");
            column_type.name("column type");
            default_value.name("default value");
            type_constraint.name("type constraint");
            table_constraints.name("table constraints");
            constraint_definition.name("constraint definition");
            primary_key_constraint.name("primary key constraint");
    
            BOOST_SPIRIT_DEBUG_NODE(program);
            BOOST_SPIRIT_DEBUG_NODE(statement);
            BOOST_SPIRIT_DEBUG_NODE(create_statement);
            BOOST_SPIRIT_DEBUG_NODE(create_table);
            BOOST_SPIRIT_DEBUG_NODE(create_table_columns);
            BOOST_SPIRIT_DEBUG_NODE(column_definition);
            BOOST_SPIRIT_DEBUG_NODE(column_type);
            BOOST_SPIRIT_DEBUG_NODE(default_value);
            BOOST_SPIRIT_DEBUG_NODE(type_constraint);
            BOOST_SPIRIT_DEBUG_NODE(table_constraints);
            BOOST_SPIRIT_DEBUG_NODE(constraint_definition);
            BOOST_SPIRIT_DEBUG_NODE(primary_key_constraint);
    
            using namespace qi::labels;
            qi::on_error<qi::fail>
            (
                program,
                std::cout
                    << phx::val("Error! Expecting ")
                    << bs::_4                               // what failed?
                    << phx::val(" here: \"")
                    << phx::construct<std::string>(bs::_3, bs::_2)   // iterators to error-pos, end
                    << phx::val("\"")
                    << std::endl
            );
        }
    
    private:
    #ifdef STATE_WS
        typedef qi::in_state_skipper<Lexer> skipper_type;
    #else
        typedef qi::unused_type skipper_type;
    #endif
        typedef qi::rule<Iterator, skipper_type> simple_rule;
    
        simple_rule program, statement, create_statement, create_table, table_constraints, constraint_definition;
        simple_rule primary_key_constraint, create_table_columns, column_definition, type_constraint, default_value, column_type;
    };
    
    std::string cin2string()
    {
        std::istreambuf_iterator<char> f(std::cin), l;
        std::string result;
        std::copy(f, l, std::back_inserter(result));
        return result;
    }
    
    int main(int argc, char* argv[])
    {
        // iterator type used to expose the underlying input stream
        typedef std::string::const_iterator base_iterator_type;
    
        // This is the lexer token type to use.
        typedef lex::lexertl::token<
            base_iterator_type, boost::mpl::vector<int, std::size_t, std::string> 
        > token_type;
    
        #ifdef STATE_WS
            typedef lex::lexertl::lexer<token_type> lexer_type;
        #else
            typedef lex::lexertl::actor_lexer<token_type> lexer_type;
        #endif
    
        // This is the token definition type (derived from the given lexer type).
        typedef sql_tokens<lexer_type> sql_tokens;
    
        // this is the iterator type exposed by the lexer 
        typedef sql_tokens::iterator_type iterator_type;
    
        // this is the type of the grammar to parse
        typedef sql_grammar<iterator_type, sql_tokens::lexer_def> sql_grammar;
    
        // now we use the types defined above to create the lexer and grammar
        // object instances needed to invoke the parsing process
        sql_tokens tokens;                         // Our lexer
        sql_grammar sql(tokens);                  // Our parser
    
        const std::string str = cin2string();
    
        // At this point we generate the iterator pair used to expose the
        // tokenized input stream.
        base_iterator_type it = str.begin();
        iterator_type iter = tokens.begin(it, str.end());
        iterator_type end = tokens.end();
    
        // Parsing is done based on the token stream, not the character
        // stream read from the input.
        // Note how we use the lexer defined above as the skip parser. It must
        // be explicitly wrapped inside a state directive, switching the lexer 
        // state for the duration of skipping whitespace.
    #ifdef STATE_WS
        std::string ws("WS");
        bool r = qi::phrase_parse(iter, end, sql, qi::in_state(ws)[tokens.self]);
    #else
        bool r = qi::parse(iter, end, sql);
    #endif
    
        if (r && iter == end)
        {
            std::cout << "-------------------------\n";
            std::cout << "Parsing succeeded\n";
            std::cout << "-------------------------\n";
        }
        else
        {
            std::cout << "-------------------------\n";
            std::cout << "Parsing failed\n";
            std::cout << "-------------------------\n";
        }
        return 0;
    }
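
    As a quick sanity check, piping the comment-first schema from the first problem into the resulting program (built without STATE_WS) now prints "Parsing succeeded":

    /* bouh */

    CREATE TABLE mytable (
      id int NOT NULL AUTO_INCREMENT
    );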