Sorry if this is a newbie question, but I need to know which token definition produced a certain token. When I print the token ID, I just get an integer. I need to know which regex generated this token.
Here's how I define my tokens:
/// Spirit.Lex token definitions for a Lisp-like input language.
///
/// Declares the regex macros, the token definitions built from them, and the
/// lexer rules. Whitespace and comments are matched but dropped from the
/// token stream; while matching them the lexer counts newlines so that
/// lineNo() can report the current line.
///
/// @tparam LexerT  The underlying Spirit.Lex lexer type. It must support
///                 semantic actions, because the whitespace/comment rules
///                 attach actions to their token definitions.
template <typename LexerT>
class Tokens: public lex::lexer<LexerT>
{
public:
    /// Builds all patterns, token definitions and lexer rules.
    ///
    /// @param input  Not used by the code visible here.
    Tokens(const std::string& input):
        lineNo_(1)  // lines are counted 1-based
    {
        using boost::spirit::lex::_start;
        using boost::spirit::lex::_end;
        using boost::spirit::lex::_pass;
        using boost::phoenix::ref;
        using boost::phoenix::construct;

        // macros: named sub-patterns, referenced below as {NAME}
        this->self.add_pattern
            ("EXP", "(e|E)(\\+|-)?\\d+")
            ("SUFFIX", "[yzafpnumkKMGTPEZY]")
            ("INTEGER", "-?\\d+")
            ("FLOAT", "-?(((\\d+)|(\\d*\\.\\d+)|(\\d+\\.\\d*))({EXP}|{SUFFIX})?)")
            ("SYMBOL", "[a-zA-Z_?@](\\w|\\?|@)*")
            ("STRING", "\\\"([^\\\"]|\\\\\\\")*\\\"");

        // whitespaces and comments
        whitespaces_ = "\\s+";
        comments_ = "(;[^\\n]*\\n)|(\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/)";

        // literals
        integer_ = "{INTEGER}";
        float_ = "{FLOAT}";
        symbol_ = "{SYMBOL}";
        string_ = "{STRING}";

        // operators
        quote_ = "'";
        backquote_ = '`';
        // ... other tokens

        // whitespace and comment rules: add the number of '\n' characters in
        // the matched text to the line counter, then discard the match.
        // NOTE(review): `count` must be a lazy (Phoenix) function visible at
        // this point; it is not declared in this snippet.
        this->self += whitespaces_ [ref(lineNo_) += count(_start, _end, '\n'), _pass = lex::pass_flags::pass_ignore];
        this->self += comments_ [ref(lineNo_) += count(_start, _end, '\n'), _pass = lex::pass_flags::pass_ignore];

        // literal rules
        this->self += integer_ | float_ | string_ | symbol_;
        // this->self += ... other tokens
    }

    ~Tokens() {}

    /// @return The current line number, as advanced by the whitespace and
    ///         comment rules.
    size_t lineNo() { return lineNo_; }

    // ignored tokens
    lex::token_def<lex::omit> whitespaces_, comments_;

    // literal tokens
    lex::token_def<int> integer_;
    lex::token_def<std::string> float_, symbol_, string_;

    // operator tokens
    lex::token_def<> quote_, backquote_;
    // ... other token definitions of type lex::token_def<>

    // current line number
    size_t lineNo_;
};
Thanks, Haitham
From the docs
To ensure every token gets assigned a id the Spirit.Lex library internally assigns unique numbers to the token definitions, starting with the constant defined by boost::spirit::lex::min_token_id.
So you can in fact rely on the token ids being assigned incrementally. However, to make things a little more friendly and robust, I'd suggest making a helper function that determines the name of a token, so you can do something like this:
// Print every valid token: its id relative to lex::min_token_id, the name of
// the token definition that produced it, and its value.
while (iter != end && token_is_valid(*iter))
{
    std::cout << "Token: " <<
        (iter->id() - lex::min_token_id) << ": " <<
        toklexer.nameof(iter) << " ('" << iter->value() << "')\n";
    ++iter;  // advance to the next token; without this the loop never ends
}
// Reaching `end` means the whole input was tokenized.
if (iter == end) { std::cout << "lineNo: " << toklexer.lineNo() << "\n"; }
Which, for input like:
// Sample input spanning three lines (two '\n'): symbols, a quoted string,
// a block comment, a float with an exponent, and an integer.
const std::string str = "symbol \"string\" \n"
"this /* is a comment */\n"
"31415926E-7 123";
Would print:
Token: 5: symbol_ ('symbol')
Token: 4: string_ ('"string"')
Token: 5: symbol_ ('this')
Token: 3: float_ ('31415926E-7')
Token: 2: integer_ ('123')
lineNo: 3
(There may be built-in support for something like this, such as a debug name for token definitions?) but I can't currently find the documentation for it. The implementation of the Tokens::nameof(It) function would be greatly simplified if you could reuse the debug-name.
Fully working demo code (slightly adapted to Boost 1_49-1_57, GCC -std=c++0x):
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/phoenix/function/adapt_callable.hpp>
namespace qi = boost::spirit::qi;
namespace lex = boost::spirit::lex;
namespace phx = boost::phoenix;
// irrelevant for question: needed this locally to make it work with my boost
// version
namespace detail {
    /// Function object that counts the elements equal to a value in an
    /// iterator range by forwarding to std::count.
    struct count {
        /// Nested result-type metafunction; always yields std::ptrdiff_t.
        template<class It1, class It2, class T> struct result { typedef std::ptrdiff_t type; };

        /// @param f  Start of the range.
        /// @param l  End of the range (one past the last element).
        /// @param x  Value to compare each element against.
        /// @return   Number of elements in [f, l) that compare equal to x.
        template<class It1, class It2, class T>
        typename result<It1, It2, T>::type operator()(It1 f, It2 l, T const& x) const {
            return std::count(f, l, x);
        }
    };
}
// Adapt detail::count into a Phoenix function named `count` taking 3
// arguments, so it can be called inside the lexer semantic actions.
BOOST_PHOENIX_ADAPT_CALLABLE(count, detail::count, 3);
/// Spirit.Lex token definitions for a Lisp-like input language, with a
/// helper that maps a token back to the name of its definition.
///
/// Whitespace and comments are matched but dropped from the token stream;
/// while matching them the lexer counts newlines so that lineNo() can report
/// the current line.
///
/// @tparam LexerT  The underlying Spirit.Lex lexer type. It must support
///                 semantic actions (e.g. lex::lexertl::actor_lexer).
template <typename LexerT>
class Tokens: public lex::lexer<LexerT>
{
public:
    /// Builds all patterns, token definitions and lexer rules.
    Tokens():
        lineNo_(1)  // lines are counted 1-based
    {
        using lex::_start;
        using lex::_end;
        using lex::_pass;
        using phx::ref;

        // macros: named sub-patterns, referenced below as {NAME}
        this->self.add_pattern
            ("EXP", "(e|E)(\\+|-)?\\d+")
            ("SUFFIX", "[yzafpnumkKMGTPEZY]")
            ("INTEGER", "-?\\d+")
            ("FLOAT", "-?(((\\d+)|(\\d*\\.\\d+)|(\\d+\\.\\d*))({EXP}|{SUFFIX})?)")
            ("SYMBOL", "[a-zA-Z_?@](\\w|\\?|@)*")
            ("STRING", "\\\"([^\\\"]|\\\\\\\")*\\\"");

        // whitespaces and comments
        whitespaces_ = "\\s+";
        comments_ = "(;[^\\n]*\\n)|(\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/)";

        // literals
        integer_ = "{INTEGER}";
        float_ = "{FLOAT}";
        symbol_ = "{SYMBOL}";
        string_ = "{STRING}";

        // operators
        quote_ = "'";
        backquote_ = '`';
        // ... other tokens

        // whitespace and comment rules: add the number of '\n' characters in
        // the matched text to the line counter, then discard the match.
        //this->self.add(whitespaces_, 1001)
        //(comments_, 1002);
        this->self = whitespaces_ [phx::ref(lineNo_) += count(_start, _end, '\n'), _pass = lex::pass_flags::pass_ignore]
                   | comments_ [phx::ref(lineNo_) += count(_start, _end, '\n'), _pass = lex::pass_flags::pass_ignore];

        // literal rules
        this->self += integer_ | float_ | string_ | symbol_;
        // this->self += ... other tokens
    }

    /// Returns the member name of the token definition that produced a token.
    ///
    /// @tparam TokIter  Iterator-like type whose `->id()` yields a token id.
    /// @param  it       Iterator positioned at the token to identify.
    /// @return The definition's member name (e.g. "integer_"), or "other"
    ///         when the id matches none of the definitions checked here.
    template <typename TokIter>
    std::string nameof(TokIter it)
    {
        // Each token_def carries the id the lexer assigned to it, so the
        // token's id can be compared against every known definition.
        if (it->id() == whitespaces_.id()) return "whitespaces_";
        if (it->id() == comments_.id())    return "comments_";
        if (it->id() == integer_.id())     return "integer_";
        if (it->id() == float_.id())       return "float_";
        if (it->id() == symbol_.id())      return "symbol_";
        if (it->id() == string_.id())      return "string_";
        if (it->id() == quote_.id())       return "quote_";
        if (it->id() == backquote_.id())   return "backquote_";
        return "other";
    }

    ~Tokens() {}

    /// @return The current line number, as advanced by the whitespace and
    ///         comment rules.
    size_t lineNo() { return lineNo_; }

    // ignored tokens
    lex::token_def</*lex::omit*/> whitespaces_, comments_;

    // literal tokens
    lex::token_def<int> integer_;
    lex::token_def<std::string> float_, symbol_, string_;

    // operator tokens
    lex::token_def<> quote_, backquote_;
    // ... other token definitions of type lex::token_def<>

    // current line number
    size_t lineNo_;
};
int main()
const std::string str = "symbol \"string\" \n"
"this /* is a comment */\n"
"31415926E-7 123";
typedef lex::lexertl::token<char const*> token_type;
typedef lex::lexertl::actor_lexer<token_type> lexer_type;
Tokens<lexer_type> toklexer;
char const* first = str.c_str();
char const* last = &first[str.size()];
lexer_type::iterator_type iter = toklexer.begin(first, last);
lexer_type::iterator_type end = toklexer.end();
while (iter != end && token_is_valid(*iter))
std::cout << "Token: " <<
(iter->id() - lex::min_token_id) << ": " <<
toklexer.nameof(iter) << " ('" << iter->value() << "')\n";
if (iter == end) { std::cout << "lineNo: " << toklexer.lineNo() << "\n"; }
else {
std::string rest(first, last);
std::cout << "Lexical analysis failed\n" << "stopped at: \""
<< rest << "\"\n";
return 0;