c++ boost-spirit boost-spirit-qi

Parsing unfinished text in boost::spirit::qi


I want to parse text that is read using ifstream::read. The problem I'm facing is that the parser always throws an expectation failure when the text it reads is incomplete. Here is my code for the parser.

/// Boost.Spirit Qi grammar that scans a flat sequence of XML fragments:
/// text runs, start tags, end tags, empty-element tags and prolog
/// (`<? ... ?>`) sections.
///
/// @tparam It       iterator type of the input range being parsed
/// @tparam Skipper  skipper used by the phrase-level rules
///                  (defaults to qi::space_type)
///
/// `print_action` is referenced by the semantic actions but is not defined
/// inside this struct; it has to be provided by the surrounding code.
///
/// The counter-incrementing actions are commented out in the constructor, so
/// in this version the three tag counters keep their initial value of 0.
template <typename It, typename Skipper= qi::space_type>
struct xmlparser: qi::grammar<It, std::string(), Skipper>{
    /// Wires up all rules; `xml_parser` is the start rule of the grammar.
    xmlparser(): xmlparser::base_type(xml_parser){
        using qi::lit;
        using qi::lexeme;
        using ascii::char_;
        using boost::phoenix::ref;
        using qi::debug;
        using boost::spirit::ascii::space;
        
        // Explicit whitespace rule, used as the last alternative of xml_parser.
        skipper= qi::char_("\t\r\n "); //qi::skip(skipper.alias())
        // Character data: one or more characters up to the next '<',
        // or a single stray quote character.
        text = !lit('<') >> +(qi::char_ - qi::char_("<")) | lit('\'') | lit('\"');
        prolog = "<?" >> +(qi::char_ - '?') >> "?>";
        name = lexeme[qi::char_("a-zA-Z:_") >> *qi::char_("-a-zA-Z0-9:_")];
        // '>' is Qi's expectation operator: once the opening quote has matched,
        // a missing value or closing quote raises qi::expectation_failure
        // instead of letting the alternative backtrack.
        attribute_value =
                    '"' > +(char_ - char_("<&\"")) > '"'
                    | '\'' > +(char_ - char_("<&'")) > '\''
                ;
        // Same here: after an attribute name, '=' and a quoted value are mandatory.
        attribute = name[print_action("ATT")] >  '=' >  attribute_value[print_action("ATT VALUE")];
        // "<name attr...>" that is neither "</..." nor terminated by "/>".
        start_tag %= '<' >> !lit('/') >> name >> *(attribute) >> !lit('/')>> '>';
        end_tag = "</" >> name >> '>';
        empty_tag =  '<' >> name >> *(attribute) >> "/>";
        // Any number of fragments, tried in the order listed.
        xml_parser = 
            *(text/*[print_action("TEXT")]*/ 
                |start_tag[/*++ref(open_tag_count)*/print_action("OPEN")] 
                | end_tag[/*++ref(end_tag_count)*/print_action("END")] 
                | empty_tag[/*++ref(empty_tag_count)*/print_action("EMPTY")] 
                | prolog
                | skipper
            );
        }

        /// @return current value of the empty-element tag counter.
        int get_empty_tag_count() const {
            return empty_tag_count;
        }

        /// @return current value of the start tag counter.
        int get_open_tag_count() const {
            return open_tag_count;
        }

        /// @return current value of the end tag counter.
        int get_end_tag_count() const {
            return end_tag_count;
        }

        private:
            // Statistics; all start at zero.
            int open_tag_count= 0;
            int end_tag_count= 0;
            int empty_tag_count= 0;
            int text_count=0;

            // Rules declared without a Skipper parameter never skip input;
            // rules declared with one skip between their elements.
            qi::rule<It> skipper;
            qi::rule<It, std::string()> text;
            qi::rule<It, std::string()> prolog;
            qi::rule<It, std::string(),Skipper> name;
            qi::rule<It, std::string()> attribute_value;
            qi::rule<It, std::string(),Skipper> attribute;
            qi::rule<It, std::string(),Skipper> start_tag;
            qi::rule<It, std::string(),Skipper> end_tag;
            qi::rule<It, std::string(),Skipper> empty_tag;
            qi::rule<It, std::string(),Skipper> xml_parser;
};

I do not have any issues when I read the text using ifstream::getline, since the text fed into the parser can be considered complete. However, when I read the text using ifstream::read, the char[bufsize] buffer may end in the middle of an XML attribute, and the parser then throws an expectation failure.

Example of unfinished text

</description>
<shipping>Will ship only within country, See description for charges</shipping>
<incategory category="category317"/>
<incategory category="categ

The function to read characters

char * buffer= new char[bufsize];
input_file.read(buffer,bufsize);
std::string bufstring(buffer);
if (extra != ""){
   bufstring = extra + bufstring;
   extra= "";
}

I would like to know whether it is possible to get back the part of the text that failed to parse and prepend it to the next chunk read into the buffer, since that next chunk contains the continuation of the unfinished text. I have tried using try and catch to carry the unparsed text over to the next read, but it doesn't seem to work.

    // Prepend the unparsed remainder carried over from the previous chunk.
    if (extra != ""){
        bufstring = extra + bufstring;
        extra= "";
    }
    // std::cout << bufstring << std::endl << std::endl;
    std::string::const_iterator iter= bufstring.begin();
    std::string::const_iterator end= bufstring.end();
    try{
        bool r= qi::phrase_parse(iter,end,xml_parser,qi::space);
        if (!r){
            std::cout << "Error found" << std::endl;
            extra = std::string(iter,end);
            std::cout << extra << std::endl;
            delete[] buffer;
            return;
        }
        // Input left over after a successful parse is kept for the next chunk.
        if (iter!=end){
            extra = std::string(iter,end);
            // std::cout << extra << std::endl;
        }
    } catch (qi::expectation_failure<std::string::const_iterator> const& e){
        // expectation_failure is templated on the iterator type passed to
        // phrase_parse. The input is iterated with std::string::const_iterator,
        // so a handler for expectation_failure<char const*> never matches.
        // NOTE(review): verify where `iter` points after the exception; the
        // failure position itself is available as e.first.
        std::cout<< std::string(iter,end) << std::endl;
        extra = std::string(iter,end);
    }

Solution

  • Don't roll your own XML parser. XML libraries have stream parsers for this purpose. They work a bunch. I could go look for my xpathreader implementation if you're interested.

    That said, maybe you're just trying to learn Spirit Qi. Let's dig in.

    Lots of small observations:

    Putting Together A Demo

    Live On Coliru

    //#define BOOST_SPIRIT_DEBUG
    #include <boost/phoenix.hpp>
    #include <boost/spirit/include/qi.hpp>
    #include <boost/spirit/include/qi_match.hpp>
    #include <boost/spirit/include/support_istream_iterator.hpp>
    #include <iomanip>
    namespace qi = boost::spirit::qi;
    
    /// Boost.Spirit Qi grammar that scans a flat sequence of XML fragments
    /// (prolog, end tags, empty-element tags, start tags, text), prints each
    /// recognised fragment to std::cout and counts the tags it sees.
    ///
    /// @tparam It       iterator type of the input range being parsed
    /// @tparam Skipper  skipper used by the phrase-level rules
    ///                  (defaults to qi::space_type)
    ///
    /// An expectation failure inside `attribute` that happens at the end of
    /// the input is reported on std::cout and turned into an ordinary parse
    /// failure; anywhere else it raises std::runtime_error.
    template <typename It, typename Skipper = qi::space_type>
    struct xmlparser : qi::grammar<It, std::string(), Skipper> {
        /// Wires up all rules; `start` is the start rule of the grammar.
        xmlparser() : xmlparser::base_type(start) {
            using boost::phoenix::ref;
            using boost::phoenix::val;
            using qi::lit;

            // Never matches: whitespace is left to the phrase-level skipper.
            skipper         = qi::eps(false); // qi::char_("\t\r\n ");
            text            = +~qi::char_("<");
            prolog          = "<?" >> +(qi::char_ - '?') >> "?>";
            name            = qi::char_("a-zA-Z:_") >> *qi::char_("-a-zA-Z0-9:_");
            // '>' is the expectation operator: after the opening quote the
            // rest of the value and the closing quote are mandatory.
            attribute_value = //
                '"' > *~qi::char_("<&\"") > '"' |
                '\'' > *~qi::char_("<&'") > '\'';

            //boost::phoenix::function print_action = [](auto&& arg) { std::cout << arg << "\n"; };
            using namespace qi::labels;
            // Builds a lazy actor that prints "CAPTION[<attribute>] ".
            auto print_action = [](auto caption) {
                return std::cout << boost::phoenix::val(caption) << "[" << _1 << "] ";
            };

            // INIT selects how the rules below are initialised: plain '='
            // (current setting) or '%=' (automatic attribute propagation).
    #define INIT = // %=
            attribute INIT name[print_action("ATT")] > '=' >
                attribute_value[print_action("ATT VALUE")];
            start_tag INIT '<' >> !lit('/') >> name >> *attribute >> !lit('/') >> '>';
            end_tag   INIT "</" >> name >> '>';
            empty_tag INIT '<' >> name >> *attribute >> "/>";
            // Any number of fragments, tried in the order listed. Note that
            // the text alternative prints but does not touch text_count.
            start INIT *(//
                          prolog |
                end_tag   [ (++ref(end_tag_count), print_action("END"))     ] |
                empty_tag [ (++ref(empty_tag_count), print_action("EMPTY")) ] |
                start_tag [ (++ref(open_tag_count), print_action("OPEN"))   ] |
                text      [ print_action("TEXT")                            ] |
                skipper);

            // Error handler placeholders: _2 = end of input, _3 = position of
            // the failure, _4 = description of what was expected.
            // Failure before the end of input -> genuine error, throw.
            // Failure exactly at the end      -> input is merely truncated;
            //                                    report it and fail the rule.
            qi::on_error<qi::fail>(
                attribute,
                boost::phoenix::if_(_3 != _2) // failed before end of input?
                    [boost::phoenix::throw_(
                         std::runtime_error("Expectation failure"))]
                        .else_[std::cout << val("[EOF] Expecting ") << _4
                                         << std::endl]);

            BOOST_SPIRIT_DEBUG_NODES((skipper)(text)(prolog)(attribute)(start_tag)(
                end_tag)(empty_tag)(start)(name)(attribute_value))

        }

        // Const so the counters can be queried on a const parser instance,
        // just like report().
        /// @return number of empty-element tags matched so far.
        int get_empty_tag_count() const { return empty_tag_count; }
        /// @return number of start tags matched so far.
        int get_open_tag_count() const  { return open_tag_count;  }
        /// @return number of end tags matched so far.
        int get_end_tag_count() const   { return end_tag_count;   }

        /// Writes all four counters to @p os, one per line.
        void report(std::ostream& os) const {
            os << "open_tag_count  : " << open_tag_count  << std::endl;
            os << "end_tag_count   : " << end_tag_count   << std::endl;
            os << "empty_tag_count : " << empty_tag_count << std::endl;
            os << "text_count      : " << text_count      << std::endl;
        }

      private:

        // mutable: the semantic actions increment these through references
        // captured in the constructor, even when the parser object is const.
        int mutable open_tag_count  = 0;
        int mutable end_tag_count   = 0;
        int mutable empty_tag_count = 0;
        int mutable text_count      = 0;

        qi::rule<It, std::string(), Skipper> start, //
            attribute, start_tag, end_tag, empty_tag;

        qi::rule<It, std::string()> // lexemes
            prolog, text, name, attribute_value;
        qi::rule<It> skipper;
    };
    
    int main() {
        xmlparser<boost::spirit::istream_iterator> const p;
        std::istringstream iss(R"(
            </description>
            <shipping>Will ship only within country, See description for charges</shipping>
            <incategory category="category317"/>
            <incategory category="categ)");
    
        std::cout << iss.str() << "\n--------------------\n";
    
        for (std::string output;
             iss >> std::noskipws >> qi::phrase_match(p, qi::space, output);
             output.clear()) //
        {
            std::cout << "\n -- Output: " << std::quoted(output) << "\n";
        }
    
        p.report(std::cout << "\n");
    
        iss.clear();
        std::cout << "\n -- Remaining: " << iss.rdbuf() << std::endl;
    }
    

    Prints

            </description>
            <shipping>Will ship only within country, See description for charges</shipping>
            <incategory category="category317"/>
            <incategory category="categ
    --------------------
    END[description] OPEN[shipping] TEXT[Will ship only within country, See description for charges] END[shipping] ATT[category] ATT VALUE[category317] EMPTY[incategory] ATT[category] [EOF] Expecting """
    ATT[category] [EOF] Expecting """
    
     -- Output: ""
    
    open_tag_count  : 1
    end_tag_count   : 2
    empty_tag_count : 1
    text_count      : 0
    
     -- Remaining: 
    

    And, if enabled, the debug output:

    <start>
      <try>\n        </descripti</try>
      <prolog>
        <try></description>\n     </try>
        <fail/>
      </prolog>
      <end_tag>
        <try></description>\n     </try>
        <name>
          <try>description>\n       </try>
          <success>>\n        <shipping></success>
          <attributes>[[d, e, s, c, r, i, p, t, i, o, n]]</attributes>
        </name>
        <success>\n        <shipping>W</success>
        <attributes>[[d, e, s, c, r, i, p, t, i, o, n]]</attributes>
      </end_tag>
      <prolog>
        <try><shipping>Will ship </try>
        <fail/>
      </prolog>
      <end_tag>
        <try><shipping>Will ship </try>
        <fail/>
      </end_tag>
      <empty_tag>
        <try><shipping>Will ship </try>
        <name>
          <try>shipping>Will ship o</try>
          <success>>Will ship only with</success>
          <attributes>[[s, h, i, p, p, i, n, g]]</attributes>
        </name>
        <attribute>
          <try>>Will ship only with</try>
          <name>
            <try>>Will ship only with</try>
            <fail/>
          </name>
          <fail/>
        </attribute>
        <fail/>
      </empty_tag>
      <start_tag>
        <try><shipping>Will ship </try>
        <name>
          <try>shipping>Will ship o</try>
          <success>>Will ship only with</success>
          <attributes>[[s, h, i, p, p, i, n, g]]</attributes>
        </name>
        <attribute>
          <try>>Will ship only with</try>
          <name>
            <try>>Will ship only with</try>
            <fail/>
          </name>
          <fail/>
        </attribute>
        <success>Will ship only withi</success>
        <attributes>[[s, h, i, p, p, i, n, g]]</attributes>
      </start_tag>
      <prolog>
        <try>Will ship only withi</try>
        <fail/>
      </prolog>
      <end_tag>
        <try>Will ship only withi</try>
        <fail/>
      </end_tag>
      <empty_tag>
        <try>Will ship only withi</try>
        <fail/>
      </empty_tag>
      <start_tag>
        <try>Will ship only withi</try>
        <fail/>
      </start_tag>
      <text>
        <try>Will ship only withi</try>
        <success></shipping>\n        </success>
        <attributes>[[W, i, l, l,  , s, h, i, p,  , o, n, l, y,  , w, i, t, h, i, n,  , c, o, u, n, t, r, y, ,,  , S, e, e,  , d, e, s, c, r, i, p, t, i, o, n,  , f, o, r,  , c, h, a, r, g, e, s]]</attributes>
      </text>
      <prolog>
        <try></shipping>\n        </try>
        <fail/>
      </prolog>
      <end_tag>
        <try></shipping>\n        </try>
        <name>
          <try>shipping>\n        <i</try>
          <success>>\n        <incategor</success>
          <attributes>[[s, h, i, p, p, i, n, g]]</attributes>
        </name>
        <success>\n        <incategory</success>
        <attributes>[[s, h, i, p, p, i, n, g]]</attributes>
      </end_tag>
      <prolog>
        <try><incategory category</try>
        <fail/>
      </prolog>
      <end_tag>
        <try><incategory category</try>
        <fail/>
      </end_tag>
      <empty_tag>
        <try><incategory category</try>
        <name>
          <try>incategory category=</try>
          <success> category="category3</success>
          <attributes>[[i, n, c, a, t, e, g, o, r, y]]</attributes>
        </name>
        <attribute>
          <try> category="category3</try>
          <name>
            <try>category="category31</try>
            <success>="category317"/>\n   </success>
            <attributes>[[c, a, t, e, g, o, r, y]]</attributes>
          </name>
          <attribute_value>
            <try>"category317"/>\n    </try>
            <success>/>\n        <incatego</success>
            <attributes>[[c, a, t, e, g, o, r, y, 3, 1, 7]]</attributes>
          </attribute_value>
          <success>/>\n        <incatego</success>
          <attributes>[[i, n, c, a, t, e, g, o, r, y]]</attributes>
        </attribute>
        <attribute>
          <try>/>\n        <incatego</try>
          <name>
            <try>/>\n        <incatego</try>
            <fail/>
          </name>
          <fail/>
        </attribute>
        <success>\n        <incategory</success>
        <attributes>[[i, n, c, a, t, e, g, o, r, y]]</attributes>
      </empty_tag>
      <prolog>
        <try><incategory category</try>
        <fail/>
      </prolog>
      <end_tag>
        <try><incategory category</try>
        <fail/>
      </end_tag>
      <empty_tag>
        <try><incategory category</try>
        <name>
          <try>incategory category=</try>
          <success> category="categ</success>
          <attributes>[[i, n, c, a, t, e, g, o, r, y]]</attributes>
        </name>
        <attribute>
          <try> category="categ</try>
          <name>
            <try>category="categ</try>
            <success>="categ</success>
            <attributes>[[c, a, t, e, g, o, r, y]]</attributes>
          </name>
          <attribute_value>
            <try>"categ</try>
            <fail/>
          </attribute_value>
          <fail/>
        </attribute>
        <fail/>
      </empty_tag>
      <start_tag>
        <try><incategory category</try>
        <name>
          <try>incategory category=</try>
          <success> category="categ</success>
          <attributes>[[i, n, c, a, t, e, g, o, r, y]]</attributes>
        </name>
        <attribute>
          <try> category="categ</try>
          <name>
            <try>category="categ</try>
            <success>="categ</success>
            <attributes>[[c, a, t, e, g, o, r, y]]</attributes>
          </name>
          <attribute_value>
            <try>"categ</try>
            <fail/>
          </attribute_value>
          <fail/>
        </attribute>
        <fail/>
      </start_tag>
      <text>
        <try><incategory category</try>
        <fail/>
      </text>
      <skipper>
        <try><incategory category</try>
        <fail/>
      </skipper>
      <success><incategory category</success>
      <attributes>[[]]</attributes>
    </start>
    <start>
      <try></try>
      <prolog>
        <try></try>
        <fail/>
      </prolog>
      <end_tag>
        <try></try>
        <fail/>
      </end_tag>
      <empty_tag>
        <try></try>
        <fail/>
      </empty_tag>
      <start_tag>
        <try></try>
        <fail/>
      </start_tag>
      <text>
        <try></try>
        <fail/>
      </text>
      <skipper>
        <try></try>
        <fail/>
      </skipper>
      <success></success>
      <attributes>[[]]</attributes>
    </start>
    

    PS

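    If you #define INIT %= instead, you will see the problem with exposing a std::string() attribute on every rule: each rule's match is appended to the same string, so the attribute keeps accumulating: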

    END[description] OPEN[descriptionshippingshipping] TEXT[descriptionshippingshippingWill ship only within country, See description for charges] END[descriptionshippingshippingWill ship only within country, See description for chargesshipping] ATT[descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategory] ATT VALUE[descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategorycategory317] EMPTY[descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategorycategory317] ATT[descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategorycategory317incategorycategory] [EOF] Expecting """
    ATT[descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategorycategory317incategorycategorycategincategorycategory] [EOF] Expecting """
    
     -- Output: "descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategorycategory317incategorycategorycategincategorycategorycateg"