Tags: c++, boost-spirit, boost-spirit-lex

how to get rid of escape character in a token with spirit::lex?


I want to tokenize my own extension of SQL syntax. This involves recognizing an escaped double quote inside a double quoted string. E.g. in MySQL these two string tokens are equivalent: """" (the second double quote acts as an escape character) and '"'. I have tried different things but I am stuck at how to replace a token's value.

#include <boost/spirit/include/lex_lexertl.hpp>
namespace lex = boost::spirit::lex;

// Question sketch (not compilable as-is: the "..." tokens are placeholders
// for the remaining SQL token definitions).
template <typename Lexer>
struct sql_tokens : lex::lexer<Lexer>
{
  sql_tokens()
  {
    string_quote_double = "\\\"";    // '"'

    // An opening quote switches the lexer into the STRING_DOUBLE state.
    this->self("INITIAL")
      = string_quote_double [ lex::_state = "STRING_DOUBLE" ] // how to also ignore + ctx.more()?
      | ...
      ;

    // Inside a double-quoted string: the quote-free body, the escaped
    // quote "", and the closing quote (which returns to INITIAL).
    this->self("STRING_DOUBLE") 
      = lex::token_def<>("[^\\\"]*") // action: ignore + ctx.more()
      | lex::token_def<>("\\\"\\\"") // how to set token value to '"' ?
      | lex::token_def<>("\\\"") [ lex::_state = "INITIAL" ]
      ;
  }

  lex::token_def<> string_quote_double, ...;
};

So how to set the token's value to " when "" has been found?

Apart from that I have also the following question: I can write a functor for a semantic action to call ctx.more() and ignore the token at the same time (thus combining "low level" tokens into a "high level" string token). But how to elegantly combine this with lex::_state = ".." ?


Solution

  • EDITED in response to comment, see below "UPDATE"


    I suggest not trying to solve that in the lexer. Let the lexer yield raw strings:

    // Lexer that yields the *raw* double-quoted string token (delimiters and
    // doubled-quote escapes still included); de-escaping is deferred to the
    // parser.
    template <typename Lexer>
        struct mylexer_t : lex::lexer<Lexer>
    {
        mylexer_t()
        {
            // Pattern: " ( any-char-except-quote | "" )* "
            string_quote_double = "\\\"([^\"]|\\\"\\\")*\\\"";

            this->self("INITIAL")
                = string_quote_double
                // Whitespace is dropped entirely and never reaches the parser.
                | lex::token_def<>("[ \t\r\n]") [ lex::_pass = lex::pass_flags::pass_ignore ]
                ;
        }

        // token_def<std::string> so the token exposes its matched text as a
        // std::string attribute (requires the extended token typedef below).
        lex::token_def<std::string> string_quote_double;
    };
    

    NOTE that exposing a token attribute like that requires a modified token typedef:

    typedef lex::lexertl::token<char const*, boost::mpl::vector<char, std::string> > token_type;
    typedef lex::lexertl::actor_lexer<token_type> lexer_type;
    

    Postprocess in the parser:

    // Grammar consuming the lexer's tokens; each raw string token is
    // de-escaped in place by the `undoublequote` semantic action.
    template <typename Iterator> struct mygrammar_t
        : public qi::grammar<Iterator, std::vector<std::string>()>
    {
        typedef mygrammar_t<Iterator> This;
    
        template <typename TokenDef>
            mygrammar_t(TokenDef const& tok) : mygrammar_t::base_type(start)
        {
            using namespace qi;
    
            // %= keeps automatic attribute propagation active even though a
            // semantic action is attached; undoublequote then edits the
            // synthesized std::string in place.
            string_quote_double %= tok.string_quote_double [ undoublequote ];
            start = *string_quote_double;
    
            BOOST_SPIRIT_DEBUG_NODES((start)(string_quote_double));
        }
    
      private:
        qi::rule<Iterator, std::vector<std::string>()> start;
        qi::rule<Iterator, std::string()> string_quote_double;
    };
    

    As you can see, undoublequote can be any Phoenix actor that satisfies the criteria for a Spirit semantic action. A brain-dead example implementation would be:

    // De-escape an SQL-style double-quoted string in place.
    //
    // `val` holds the raw token text *including* the surrounding quotes,
    // with any embedded quote doubled ("" -> "), e.g.  "bla""blo".
    // On success the delimiters are stripped and doubled quotes collapsed,
    // and true is returned.  On a malformed token (missing delimiter, or a
    // lone quote inside the body) false is returned.
    //
    // Fix over the original: the empty string token  ""  is now accepted
    // (the original consumed the closing quote as an "escape" and fell off
    // the end, returning false), and a lone interior quote is rejected
    // instead of being silently swallowed.
    static bool undoublequote(std::string& val)
    {
        // Both delimiting quotes must be present; "" is the empty string.
        if (val.size() < 2 || val.front() != '"' || val.back() != '"')
            return false;

        std::size_t out = 0;                      // next write position
        std::size_t in  = 1;                      // skip the opening quote
        std::size_t const last = val.size() - 1;  // closing quote position
        while (in < last) {
            if (val[in] == '"') {
                // Inside the body a quote is only legal when doubled.
                if (in + 1 >= last || val[in + 1] != '"')
                    return false;
                ++in; // collapse "" into a single "
            }
            val[out++] = val[in++];
        }
        val.resize(out); // keep only the de-escaped characters
        return true;
    }
    

    But I suggest you write a "proper" de-escaper (as I'm pretty sure MySQL will allow \t, \r, \u001e or even more archaic stuff as well).

    I have posted some more complete samples in old answers.


    UPDATE

    In fact, as you indicated, it is fairly easy to integrate the attribute value normalization into the lexer itself:

    // Lexer that normalizes the string attribute itself: a semantic action
    // de-escapes the raw token text before the parser ever sees it.
    template <typename Lexer>
        struct mylexer_t : lex::lexer<Lexer>
    {
        // Deferred Callable Object ("polymorphic functor") usable as a Lex
        // semantic action, invoked as (begin, end, pass, id, context).
        struct undoublequote_lex_type {
            // result-of protocol for Phoenix: the action returns void.
            template <typename, typename, typename, typename> struct result { typedef void type; };
    
            template <typename It, typename IdType, typename pass_flag, typename Ctx>
                void operator()(It& f, It& l, pass_flag& pass, IdType& id, Ctx& ctx) const {
                    std::string raw(f,l);                   // copy raw token text [f, l)
                    if (undoublequote(raw))
                        ctx.set_value(raw);                 // replace token value with the de-escaped text
                    else
                        pass = lex::pass_flags::pass_fail;  // malformed token: fail the match
                }
        } undoublequote_lex;  // NOTE(review): shadowed by the static local in the ctor — one of the two is redundant
    
        mylexer_t()
        {
            // Raw pattern: " ( any-char-except-quote | "" )* "
            string_quote_double = "\\\"([^\"]|\\\"\\\")*\\\"";
    
            const static undoublequote_lex_type undoublequote_lex;
            this->self("INITIAL")
                = string_quote_double [ undoublequote_lex ]
                // Whitespace is dropped before it reaches the parser.
                | lex::token_def<>("[ \t\r\n]") [ lex::_pass = lex::pass_flags::pass_ignore ]
                ;
        }
    
        // std::string attribute carries the (already de-escaped) value.
        lex::token_def<std::string> string_quote_double;
    };
    

    This reuses the same undoublequote function shown above, but wraps it in Deferred Callable Object (or "polymorphic functor") undoublequote_lex_type that satisfies the criteria for a Lexer Semantic Action.


    Here is a fully working proof of concept:

    //#include <boost/config/warning_disable.hpp>
    //#define BOOST_SPIRIT_DEBUG_PRINT_SOME 80
    //#define BOOST_SPIRIT_DEBUG // before including Spirit
    #include <boost/spirit/include/lex_lexertl.hpp>
    #include <boost/spirit/include/qi.hpp>
    #include <fstream>
    #ifdef MEMORY_MAPPED
    #   include <boost/iostreams/device/mapped_file.hpp>
    #endif
    //#include <boost/spirit/include/lex_generate_static_lexertl.hpp>
    
    namespace /*anon*/
    {
        namespace phx=boost::phoenix;
        namespace qi =boost::spirit::qi;
        namespace lex=boost::spirit::lex;
    
        // Lexer yielding the raw double-quoted string token (delimiters and
        // doubled-quote escapes included); de-escaping happens in the parser.
        template <typename Lexer>
            struct mylexer_t : lex::lexer<Lexer>
        {
            mylexer_t()
            {
                // Pattern: " ( any-char-except-quote | "" )* "
                string_quote_double = "\\\"([^\"]|\\\"\\\")*\\\"";
    
                this->self("INITIAL")
                    = string_quote_double
                    // Whitespace never reaches the parser.
                    | lex::token_def<>("[ \t\r\n]") [ lex::_pass = lex::pass_flags::pass_ignore ]
                    ;
            }
    
            // std::string attribute exposes the matched text to the grammar.
            lex::token_def<std::string> string_quote_double;
        };
    
        // De-escape an SQL-style double-quoted string in place.
        //
        // `val` holds the raw token text *including* the surrounding quotes,
        // with any embedded quote doubled ("" -> "), e.g.  "bla""blo".
        // Returns true on success (delimiters stripped, doubled quotes
        // collapsed); false on a malformed token (missing delimiter, or a
        // lone quote inside the body).
        //
        // Fix over the original: the empty string token  ""  is accepted
        // (the original treated the closing quote as an escape, ran off the
        // end and returned false), and a lone interior quote is rejected.
        static bool undoublequote(std::string& val)
        {
            // Both delimiting quotes must be present; "" is the empty string.
            if (val.size() < 2 || val.front() != '"' || val.back() != '"')
                return false;

            std::size_t out = 0;                      // next write position
            std::size_t in  = 1;                      // skip the opening quote
            std::size_t const last = val.size() - 1;  // closing quote position
            while (in < last) {
                if (val[in] == '"') {
                    // Inside the body a quote is only legal when doubled.
                    if (in + 1 >= last || val[in + 1] != '"')
                        return false;
                    ++in; // collapse "" into a single "
                }
                val[out++] = val[in++];
            }
            val.resize(out); // keep only the de-escaped characters
            return true;
        }
    
        // Grammar consuming the lexer's tokens; each raw string token is
        // de-escaped in place by the `undoublequote` semantic action.
        template <typename Iterator> struct mygrammar_t
            : public qi::grammar<Iterator, std::vector<std::string>()>
        {
            typedef mygrammar_t<Iterator> This;
    
            template <typename TokenDef>
                mygrammar_t(TokenDef const& tok) : mygrammar_t::base_type(start)
            {
                using namespace qi;
    
                // %= keeps automatic attribute propagation active despite the
                // attached semantic action; undoublequote then edits the
                // synthesized std::string in place.
                string_quote_double %= tok.string_quote_double [ undoublequote ];
                start = *string_quote_double;
    
                BOOST_SPIRIT_DEBUG_NODES((start)(string_quote_double));
            }
    
          private:
            qi::rule<Iterator, std::vector<std::string>()> start;
            qi::rule<Iterator, std::string()> string_quote_double;
        };
    }
    
    // Lex + parse `v`, returning the de-escaped strings.  On failure a
    // diagnostic goes to stderr and whatever was collected is returned.
    std::vector<std::string> do_test_parse(const std::string& v)
    {
        char const *first = &v[0];
        char const *last = first+v.size();
    
        // The token must be able to carry a std::string attribute, hence the
        // mpl::vector<char, std::string> attribute-type list; actor_lexer is
        // required because the lexer attaches semantic actions.
        typedef lex::lexertl::token<char const*, boost::mpl::vector<char, std::string> > token_type;
        typedef lex::lexertl::actor_lexer<token_type> lexer_type;
    
        typedef mylexer_t<lexer_type>::iterator_type iterator_type;
        // Built once and reused across calls (table generation is costly).
        const static mylexer_t<lexer_type> mylexer;
        const static mygrammar_t<iterator_type> parser(mylexer);
    
        auto iter = mylexer.begin(first, last);
        auto end = mylexer.end();
    
        std::vector<std::string> data;
        bool r = qi::parse(iter, end, parser, data);
    
        r = r && (iter == end);  // demand that the whole input was consumed
    
        if (!r)
            // NOTE(review): iter->state() dereferences the token iterator;
            // confirm this is safe when iter == end (full-consumption failure).
            std::cerr << "parsing (" << iter->state() << ") failed at: '" << std::string(first, last) << "'\n";
    
        return data;
    }
    
    int main(int argc, const char *argv[])
    {
        // Parse the sample input  "bla""blo"  and echo every de-escaped
        // string the grammar produced, one per line.
        const std::vector<std::string> strings = do_test_parse( "\"bla\"\"blo\"");
        for (std::vector<std::string>::size_type i = 0; i != strings.size(); ++i)
            std::cout << strings[i] << std::endl;
    }