I have a lexer, and based on that lexer I now want to create a grammar that uses the tokens generated by this lexer. I tried adapting some examples that I found, and now I have something that compiles and works at least a little bit, but one of my tests that should fail does not. Now I want to know why, and I also want to know what I'm actually doing there (I want to understand it - I just copied some code from some examples, but that doesn't really improve my understanding much).
#include <boost/spirit/include/lex_lexertl.hpp>
namespace lex = boost::spirit::lex;
/// Lexer definition for Boost.Spirit.Lex.
///
/// Identifiers, integer/hex literals, float literals and punctuators are
/// registered in the default lexer state. White space is registered in a
/// separate lexer state named "WS", so it is not produced as a token of the
/// default state.
///
/// The ID_* token ids used below are not defined in this excerpt; they must
/// be declared before this struct.
///
/// @tparam Lexer  the underlying lexer type passed on to lex::lexer<>.
template <typename Lexer>
struct custom_lexer : lex::lexer<Lexer>
{
    /// Sets up the token patterns and registers them with the lexer.
    custom_lexer()
        : identifier("[a-zA-Z_][a-zA-Z0-9_]*")
        , white_space("[ \\t\\n]+")
        // NOTE(review): this pattern does not match a lone "0".
        , integer_value("[1-9][0-9]*")
        , hex_value("0[xX][0-9a-fA-F]+")
        // Two float forms: digits after the dot (".5", "1.5e3") ...
        , float_value("[0-9]*\\.[0-9]+([eE][+-]?[0-9]+)?")
        // ... and digits only before the dot ("1.", "1.e3").
        , float_value2("[0-9]+\\.([eE][+-]?[0-9]+)?")
        , punctuator("\\[|\\]|\\(|\\)|\\.|&>|\\*\\*|\\*|\\+|-|~|!|\\/|%|<<|>>|<|>|<=|>=|==|!=|\\^|&|\\||\\^\\^|&&|\\|\\||\\?|:|,")// [ ] ( ) . &> ** * + - ~ ! / % << >> < > <= >= == != ^ & | ^^ && || ? : ,
    {
        using boost::spirit::lex::_start;
        using boost::spirit::lex::_end;

        // Default-state tokens. Decimal and hex literals share ID_INTEGER,
        // and both float forms share ID_FLOAT.
        this->self.add
            (identifier, ID_IDENTIFIER)
            /*(white_space, ID_WHITESPACE)*/
            (integer_value, ID_INTEGER)
            (hex_value, ID_INTEGER)
            (float_value, ID_FLOAT)
            (float_value2, ID_FLOAT)
            (punctuator, ID_PUNCTUATOR);

        // White space lives in its own lexer state, "WS".
        this->self("WS") = white_space;
    }

    // Declaration order matches the constructor's initializer list.
    lex::token_def<std::string> identifier;
    lex::token_def<lex::omit> white_space;
    lex::token_def<int> integer_value;
    lex::token_def<int> hex_value;
    lex::token_def<double> float_value;
    lex::token_def<double> float_value2;
    lex::token_def<> punctuator;
};
namespace qi = boost::spirit::qi;
namespace lex = boost::spirit::lex;
/// Qi grammar over the token stream produced by the lexer.
///
/// The start rule `ges` accepts exactly one token whose id is ID_INTEGER or
/// ID_FLOAT. The skipper is qi::in_state_skipper<Lexer>.
///
/// @tparam Iterator  token iterator type the grammar parses.
/// @tparam Lexer     lexer definition type used to form the skipper type.
template< typename Iterator, typename Lexer>
struct custom_grammar : qi::grammar<Iterator, qi::in_state_skipper<Lexer>>
{
    /// @param tok  the lexer/token definition; currently unused by the rule,
    ///             which refers to tokens by id only.
    template< typename TokenDef >
    custom_grammar(const TokenDef& tok) : custom_grammar::base_type(ges)
    {
        ges = qi::token(ID_INTEGER) | qi::token(ID_FLOAT);
    }

    qi::rule<Iterator, qi::in_state_skipper<Lexer>> ges;
};
And an example:
namespace lex = boost::spirit::lex;
namespace qi = boost::spirit::qi;
// Input under test: two integers separated by a blank.
std::string test("1234 56");

// Token, lexer and token-iterator types for a lexer over raw char pointers.
using token_type = lex::lexertl::token<char const*, lex::omit, boost::mpl::true_>;
using lexer_type = lex::lexertl::lexer<token_type>;
using iterator_type = custom_lexer<lexer_type>::iterator_type;

// The grammar is built from the lexer instance.
custom_lexer<lexer_type> my_lexer;
custom_grammar<iterator_type, custom_lexer<lexer_type>::lexer_def> my_grammar(my_lexer);

// Character range covering the whole input string.
char const* first = test.c_str();
char const* last = first + test.size();

// Token range produced by lexing [first, last).
lexer_type::iterator_type iter = my_lexer.begin(first, last);
lexer_type::iterator_type end = my_lexer.end();

// Parse the tokens; the skipper is the lexer's "WS" state.
bool r = qi::phrase_parse(iter, end, my_grammar, qi::in_state("WS")[my_lexer.self]);
My assumption is that this returns true because the whitespace is skipped - because of qi::in_state("WS"). Is that true? Additionally, I know how I can output additional tokens for whitespace - but then I don't know what to put at the location where the qi::in_state is now - without it, it isn't working.
Any ideas what I can improve regarding the structure? Why is the debug output so funny?
Thank you for your help.
Your parser isn't failing, but no, it isn't 'silently' skipping the whitespace either (it parses only one non-whitespace token anyway).
A property of the *phrase_parse family of Spirit APIs is that it may not match the full input. In fact, this is why it takes the first iterator by reference: after parsing, that iterator indicates where parsing stopped.
I have changed a few bits around so you can easily access the source iterator, by using lex::tokenize_and_phrase_parse instead of calling qi::phrase_parse on the lexer's token iterators:
// Source character range; `first` is moved by the call below.
Iterator first = test.c_str();
Iterator last = first + test.size();

// Lex and parse in one call, skipping with the lexer's "WS" state.
bool r = lex::tokenize_and_phrase_parse(
    first, last, my_lexer, my_grammar,
    qi::in_state("WS")[my_lexer.self]);

// Print the parse result and whatever input was left unconsumed.
std::cout << std::boolalpha << r << "\n";
std::cout << "Remaining unparsed: '" << std::string(first, last) << "'\n";
The output is:
true
Remaining unparsed: '56'
Here is a full working example (note I also changed the second parameter of the grammar class to be the Skipper directly, which is more typical for Spirit grammars):
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
namespace qi = boost::spirit::qi;
namespace lex = boost::spirit::lex;
/// Lexer definition for Boost.Spirit.Lex.
///
/// Identifiers, integer/hex literals, float literals and punctuators are
/// registered in the default lexer state. White space is registered in a
/// separate lexer state named "WS", so it is not produced as a token of the
/// default state.
///
/// The ID_* token ids used below are not defined in this excerpt; they must
/// be declared before this struct.
///
/// @tparam Lexer  the underlying lexer type passed on to lex::lexer<>.
template <typename Lexer>
struct custom_lexer : lex::lexer<Lexer>
{
    /// Sets up the token patterns and registers them with the lexer.
    custom_lexer()
        : identifier    ("[a-zA-Z_][a-zA-Z0-9_]*")
        , white_space   ("[ \\t\\n]+")
        // NOTE(review): this pattern does not match a lone "0".
        , integer_value ("[1-9][0-9]*")
        , hex_value     ("0[xX][0-9a-fA-F]+")
        // Two float forms: digits after the dot (".5", "1.5e3") ...
        , float_value   ("[0-9]*\\.[0-9]+([eE][+-]?[0-9]+)?")
        // ... and digits only before the dot ("1.", "1.e3").
        , float_value2  ("[0-9]+\\.([eE][+-]?[0-9]+)?")
        , punctuator    ("\\[|\\]|\\(|\\)|\\.|&>|\\*\\*|\\*|\\+|-|~|!|\\/|%|<<|>>|<|>|<=|>=|==|!=|\\^|&|\\||\\^\\^|&&|\\|\\||\\?|:|,")// [ ] ( ) . &> ** * + - ~ ! / % << >> < > <= >= == != ^ & | ^^ && || ? : ,
    {
        using boost::spirit::lex::_start;
        using boost::spirit::lex::_end;

        // Default-state tokens. Decimal and hex literals share ID_INTEGER,
        // and both float forms share ID_FLOAT.
        this->self.add
            (identifier   , ID_IDENTIFIER)
            /*(white_space  , ID_WHITESPACE)*/
            (integer_value, ID_INTEGER)
            (hex_value    , ID_INTEGER)
            (float_value  , ID_FLOAT)
            (float_value2 , ID_FLOAT)
            (punctuator   , ID_PUNCTUATOR);

        // White space lives in its own lexer state, "WS".
        this->self("WS") = white_space;
    }

    // Declaration order matches the constructor's initializer list.
    lex::token_def<std::string> identifier;
    lex::token_def<lex::omit>   white_space;
    lex::token_def<int>         integer_value;
    lex::token_def<int>         hex_value;
    lex::token_def<double>      float_value;
    lex::token_def<double>      float_value2;
    lex::token_def<>            punctuator;
};
/// Qi grammar over the token stream produced by the lexer.
///
/// The start rule `ges` accepts exactly one token whose id is ID_INTEGER or
/// ID_FLOAT.
///
/// @tparam Iterator  token iterator type the grammar parses.
/// @tparam Skipper   skipper type, passed directly as the second parameter.
template< typename Iterator, typename Skipper>
struct custom_grammar : qi::grammar<Iterator, Skipper>
{
    /// @param tok  the lexer/token definition; currently unused by the rule,
    ///             which refers to tokens by id only.
    template< typename TokenDef >
    custom_grammar(const TokenDef& tok) : custom_grammar::base_type(ges)
    {
        ges = qi::token(ID_INTEGER) | qi::token(ID_FLOAT);
    }

    qi::rule<Iterator, Skipper > ges;
};
int main()
std::string test("1234 56");
typedef char const* Iterator;
typedef lex::lexertl::token<Iterator, lex::omit, boost::mpl::true_> token_type;
typedef lex::lexertl::lexer<token_type> lexer_type;
typedef qi::in_state_skipper<custom_lexer<lexer_type>::lexer_def> skipper_type;
typedef custom_lexer<lexer_type>::iterator_type iterator_type;
custom_lexer<lexer_type> my_lexer;
custom_grammar<iterator_type, skipper_type> my_grammar(my_lexer);
Iterator first = test.c_str();
Iterator last = &first[test.size()];
bool r = lex::tokenize_and_phrase_parse(first,last,my_lexer,my_grammar,qi::in_state( "WS" )[ my_lexer.self ]);
std::cout << std::boolalpha << r << "\n";
std::cout << "Remaining unparsed: '" << std::string(first,last) << "'\n";