Tags: regex, c++11, gcc, libstdc++

Unexpected output of C++11 regex?


#include <iostream>
#include <regex>
#include <string>
#include <vector>
using namespace std;
// Prints the source text of an expression, then ": ", then its value, to cout.
// The do { } while (0) wrapper lets the macro be used like a single statement.
#define debug(exp) do { cout << #exp << ": " << (exp) << endl; } while (0)
// Builds one alternation regex from a fixed dictionary, walks its matches
// over `text`, and collects the matched words in `res`.  Throws
// std::invalid_argument if the dictionary is empty and std::runtime_error
// when a match fails the position check inside the loop.
int main (int argc, char *argv[])
{
  vector<string> res;
  string pattern (""),
    text ("ilovechina");
  vector<string> dict = { "i", "love", "china", "lovechina", "ilove" };
  vector<string>::iterator dict_it = dict.begin ();
  if (dict_it == dict.end ()) throw invalid_argument (__func__);
  // Join the dictionary words with '|' so the regex matches any one of them.
  while (true)
    {
      pattern += *dict_it;
      if (++dict_it == dict.end ()) break;
      else pattern += "|";
    }
  debug (pattern);
  regex re (pattern);
  sregex_iterator ri (text.begin (), text.end (), re),
    ri_end;
  for (; ri != ri_end; ++ri)
    {
      // Sub-match 0 is the whole match for this iteration.
      ssub_match sm = (*ri)[0];

      // Edit: this check is the bug.  It was meant to verify that each match
      // starts immediately after the previous one, but
      // match_results::position is the distance from the start of the string
      // being searched (what the standard calls the target sequence), not
      // from the end of the previous match.  It is therefore nonzero for
      // every match after the first, and the second match ("love" at
      // offset 1) makes this branch throw.
      if (ri->position (0)) // nonzero offset is treated as "text here is not in the dictionary"
    {
      debug (ri->position (0));
      debug (ri->length (0));
      debug (string (sm.first, sm.second));
      throw runtime_error ("not in dictionary");
    }
      // Edit: here position is the right tool, because both sides are
      // measured from the start of `text`: the sum equals text.length ()
      // exactly when this match ends at the end of `text`, i.e. every word
      // has been delimited.  Note that the break comes before the push_back
      // below, so this final match is not added to `res`.
      if (ri->position (0) + ri->length (0) == text.length ()) // successfully delimited all the words in the text
    break;
      res.push_back (sm.str ());
    }
  // NOTE(review): after the break above `ri` still differs from `ri_end`, so
  // this throw would also fire on the success path, while running out of
  // matches (ri == ri_end) passes silently -- the condition looks inverted;
  // confirm intent.
  if (ri != ri_end) throw runtime_error ("not in dictionary");
}

I expect this code to delimit the words in text based on the elements of dict. But why does it seem to skip the first word "i" and go straight to the second word "love", printing 1 as the first value of ri->position (0)?

My output:

pattern: i|love|china|lovechina|ilove
ri->position (0): 1
ri->length (0): 4
string (sm.first, sm.second): love
terminate called after throwing an instance of 'std::runtime_error'
  what():  not in dictionary
Aborted (core dumped)

Probably just a stupid error.


Solution

  • Debugged version:

    #include <iostream>
    #include <regex>
    #include <string>
    #include <vector>
    using namespace std;
    // Prints the source text of an expression, then ": ", then its value, to cout.
    // The do { } while (0) wrapper lets the macro be used like a single statement.
    #define debug(exp) do { cout << #exp << ": " << (exp) << endl; } while (0)
    // Splits TEXT into a sequence of consecutive words taken from DICT.
    //
    // The dictionary is turned into one alternation regex (word1|word2|...)
    // and matched repeatedly over TEXT.  Every match must begin exactly where
    // the previous one ended (the first one at the start of TEXT), and the
    // last match must end at the end of TEXT.
    //
    // Parameters:
    //   text - the string to split; an empty string yields an empty result.
    //   dict - the allowed words, treated as literal text.  Regex
    //          metacharacters in a word are escaped, so "a.b" only matches
    //          "a.b" and "c++" is a valid entry.
    //
    // Returns the matched words in the order they occur in TEXT.
    //
    // Throws:
    //   std::invalid_argument - DICT is empty.
    //   std::runtime_error    - a gap precedes a match ("not a instant
    //                           match"), or the matches run out before the
    //                           end of TEXT ("search failed").
    //
    // Limitation: at each position the first alternative that matches wins
    // (ECMAScript alternation is ordered) and the choice is never revisited,
    // so this is a greedy split, not a search over all segmentations.  With
    // dict { "ab", "a", "bc" } the text "abc" fails even though "a" + "bc"
    // would cover it.
    std::vector<std::string> delimit_words (const std::string &text, const std::vector<std::string> &dict)
    {
      if (dict.empty ()) throw std::invalid_argument (__func__);

      // Characters with a special meaning in the default (ECMAScript) grammar.
      const std::string metachars ("^$\\.*+?()[]{}|");

      std::string pattern;
      bool first = true;
      for (const std::string &word : dict)
        {
          if (!first) pattern += '|';
          first = false;
          for (char c : word)
            {
              // Escape metacharacters so each word is matched literally.
              if (metachars.find (c) != std::string::npos) pattern += '\\';
              pattern += c;
            }
        }
      const std::regex re (pattern);

      std::vector<std::string> res;
      if (text.empty ()) return res;

      const std::sregex_iterator ri_end;
      for (std::sregex_iterator ri (text.begin (), text.end (), re);
           ri != ri_end;
           ++ri)
        {
          // For a regex iterator, prefix () spans from the end of the previous
          // match (or the start of TEXT) to the start of this one; a non-empty
          // prefix means some characters were skipped.
          if (ri->prefix ().length () != 0)
            throw std::runtime_error ("not a instant match");
          res.push_back (ri->str (0));
          // Done once a match ends exactly at the end of TEXT.  Comparing
          // iterators avoids mixing the signed position/length values with
          // the unsigned text.length ().
          if ((*ri)[0].second == text.end ())
            return res;
        }
      // Matches ran out before the end of TEXT was reached.
      throw std::runtime_error ("search failed");
    }
    
    // Demo driver: splits a fixed sample text with a fixed dictionary and
    // prints the resulting words on one line, each followed by a space.
    int main (int argc, char *argv[])
    {
      const std::vector<std::string> dict = { "i", "love", "china", "lovechina", "ilove" };
      const std::string text = "ilovechina";

      // delimit_words reports failure by throwing; nothing is caught here.
      const std::vector<std::string> words = delimit_words (text, dict);

      for (const std::string &word : words)
        std::cout << word << ' ';
      std::cout << std::endl;
      return 0;
    }