regexrubypegparslet

Trying to group terms by OR operator


I am trying to parse a string so that I can easily identify terms that are separated by " OR ".

I currently have the following rules and parser class setup:

# Grammar for a space-separated query whose terms may be joined by " OR ".
class Parser < Parslet::Parser
  # One or more literal space characters.
  rule(:space)     { str(' ').repeat(1) }
  # A run of one or more characters that are neither whitespace nor a double quote.
  rule(:word)      { match['^\s"'].repeat(1) }
  # The literal OR keyword with spaces on both sides.
  rule(:or_op)     { space >> str('OR') >> space }
  # A single word captured under :term, accepted only when no OR operator follows it.
  rule(:term)      { word.as(:term) >> or_op.absent? }
  # One or more "[word] OR word" groups. Only the whole repetition is named
  # (:or_terms); nothing inside it is, so the operands are not captured
  # individually.
  rule(:or_terms)  { (word.maybe >> or_op >> word).repeat(1).as(:or_terms) }
  # Tries or_terms first and falls back to a single term.
  rule(:clause)    { (or_terms | term).as(:clause) }
  # Zero or more clauses, each optionally followed by spaces.
  rule(:query)     { (clause >> space.maybe).repeat.as(:query) }
  root(:query)

  # Parses +query+ with a fresh Parser instance and returns the result of #parse.
  def self.parse_tree_for(query)
    new.parse(query)
  end
end

This currently allows me to do:

Parser.parse_tree_for('wow -bob')
=> {:query=>[{:clause=>{:term=>"wow"@0}}, {:clause=>{:term=>"-bob"@4}}]}

Parser.parse_tree_for('wow OR lol')
=> {:query=>[{:clause=>{:or_terms=>"wow OR lol"@0}}]}

Parser.parse_tree_for('wow OR lol OR omg')
=> {:query=>[{:clause=>{:or_terms=>"wow OR lol OR omg"@0}}]}

Which works OK, but ideally I would like something that gives me those terms individually, each with an `or` flag, like: {:query=>[{:clause=>{:term=>"wow",:or=>true}},{:clause=>{:term=>"lol",:or=>true}},{:clause=>{:term=>"omg",:or=>true}}]}

Is this something that should be done with a transformer? For example, should I just add a transformer rule that does split(' OR '), or is there a better way to set up my rules?


Solution

  • You need an `as` on each thing you want to explicitly capture.

    Your or_terms logic is a little flaky. Always put the required stuff first, then the optional stuff.

    Try this...

    require 'parslet'
    
    # PEG grammar for space-separated search terms that may be chained with " OR ".
    # Rules are listed top-down, starting from the root.
    class Parser < Parslet::Parser
      root(:query)

      # Zero or more clauses, each optionally followed by spaces.
      rule(:query) do
        (clause >> space.maybe).repeat.as(:query)
      end

      # A stand-alone term is tried first; an OR chain is the fallback.
      rule(:clause) do
        (term | or_terms).as(:clause)
      end

      # Required leading word, then any number of "OR word" continuations.
      rule(:or_terms) do
        or_tail = (or_op >> word).repeat(0)
        (word >> or_tail).as(:or_terms)
      end

      # A word that is not followed by an OR operator.
      rule(:term) do
        word.as(:term) >> or_op.absent?
      end

      # The OR keyword, padded by spaces on both sides.
      rule(:or_op) do
        space >> str('OR') >> space
      end

      # One or more characters that are neither whitespace nor a double quote,
      # always captured under :word.
      rule(:word) do
        match['^\s"'].repeat(1).as(:word)
      end

      # One or more literal space characters.
      rule(:space) do
        str(' ').repeat(1)
      end

      class << self
        # Parses +query+ with a new Parser instance and returns the resulting tree.
        def parse_tree_for(query)
          new.parse(query)
        end
      end
    end
    
    # Demo: an OR chain; the author's recorded output shows each operand
    # captured individually as a :word entry under :or_terms.
    puts Parser.parse_tree_for('wow OR lol OR omg')
    # {:query=>[{:clause=>{:or_terms=>[{:word=>"wow"@0}, {:word=>"lol"@7}, {:word=>"omg"@14}]}}]}
    
    # Demo: a lone word; the recorded output shows it wrapped in :term.
    puts Parser.parse_tree_for('wow')
    # {:query=>[{:clause=>{:term=>{:word=>"wow"@0}}}]}
    

    I added `as` to word so that words always get captured explicitly.

    It is better to capture more than you want up front, then later flatten it out with the transformer.

    Assuming you are going to extend this to cover AND as well... you will find that making the AND and OR expressions required (i.e. every term is always wrapped in and_terms and or_terms, even when there is only one) will make operator precedence easier to deal with.

    require 'parslet'
    
    # Grammar with two operator levels: AND chains are nested inside OR chains,
    # so AND groups its operands before OR does.
    class Parser < Parslet::Parser
      root(:query)

      # Zero or more clauses, each optionally followed by spaces.
      rule(:query) do
        (clause >> space.maybe).repeat.as(:query)
      end

      # Every clause is an OR chain (possibly with a single member).
      rule(:clause) do
        or_terms.as(:clause)
      end

      # One AND group, then any number of "OR <AND group>" continuations.
      rule(:or_terms) do
        more_and_groups = (or_op >> and_terms).repeat(0)
        (and_terms >> more_and_groups).as(:or_terms)
      end

      # One term, then any number of "AND term" continuations.
      rule(:and_terms) do
        more_terms = (and_op >> term).repeat(0)
        (term >> more_terms).as(:and_terms)
      end

      # A single word captured under :term.
      rule(:term) do
        word.as(:term)
      end

      # The AND keyword, padded by spaces on both sides.
      rule(:and_op) do
        space >> str('AND') >> space
      end

      # The OR keyword, padded by spaces on both sides.
      rule(:or_op) do
        space >> str('OR') >> space
      end

      # One or more characters that are neither whitespace nor a double quote.
      rule(:word) do
        match['^\s"'].repeat(1)
      end

      # One or more literal space characters.
      rule(:space) do
        str(' ').repeat(1)
      end

      class << self
        # Parses +query+ with a new Parser instance and returns the resulting tree.
        def parse_tree_for(query)
          new.parse(query)
        end
      end
    end
    
    # Demo: a pure OR chain; in the recorded output each operand sits in its
    # own :and_terms group.
    pp Parser.parse_tree_for('wow OR lol OR omg')
    # {:query=>
    #   [{:clause=>
    #      {:or_terms=>
    #        [{:and_terms=>{:term=>"wow"@0}},
    #         {:and_terms=>{:term=>"lol"@7}},
    #         {:and_terms=>{:term=>"omg"@14}}]}}]}
    
    # Demo: a lone word is still wrapped in both :or_terms and :and_terms.
    pp Parser.parse_tree_for('wow')
    # {:query=>[{:clause=>{:or_terms=>{:and_terms=>{:term=>"wow"@0}}}}]}
    
    # Demo: mixed operators; "lol AND omg" is grouped together inside the OR chain.
    pp Parser.parse_tree_for('wow OR lol AND omg OR bob')
    # {:query=>
    #   [{:clause=>
    #      {:or_terms=>
    #        [{:and_terms=>{:term=>"wow"@0}},
    #         {:and_terms=>[{:term=>"lol"@7}, {:term=>"omg"@15}]},
    #         {:and_terms=>{:term=>"bob"@22}}]}}]}
    

    In answer to your full question... In transformers you have to match an entire hash at a time. To get around this you can match with `subtree`, but that is usually a hack.

    require 'parslet'
    
    # Grammar used by the transform example: every clause is an OR chain of terms.
    class Parser < Parslet::Parser
      root(:query)

      # Zero or more clauses, each optionally followed by spaces.
      rule(:query) do
        (clause >> space.maybe).repeat.as(:query)
      end

      # Every clause is an OR chain (possibly with a single member).
      rule(:clause) do
        or_terms.as(:clause)
      end

      # One term, then any number of "OR term" continuations.
      rule(:or_terms) do
        more_terms = (or_op >> term).repeat(0)
        (term >> more_terms).as(:or_terms)
      end

      # A single word captured under :term.
      rule(:term) do
        word.as(:term)
      end

      # The AND keyword, padded by spaces on both sides.
      # Not referenced by any other rule in this class.
      rule(:and_op) do
        space >> str('AND') >> space
      end

      # The OR keyword, padded by spaces on both sides.
      rule(:or_op) do
        space >> str('OR') >> space
      end

      # One or more characters that are neither whitespace nor a double quote.
      rule(:word) do
        match['^\s"'].repeat(1)
      end

      # One or more literal space characters.
      rule(:space) do
        str(' ').repeat(1)
      end

      class << self
        # Parses +query+ with a new Parser instance and returns the resulting tree.
        def parse_tree_for(query)
          new.parse(query)
        end
      end
    end
    
    # Flattens the Parser tree into a list of single-term clauses, tagging the
    # terms that came from an OR chain with or: true.
    class MyTransform < Parslet::Transform
      # Unwrap {:term => value} to the bare value so the :or_terms rules below
      # receive plain values rather than hashes.
      rule(term: simple(:slice)) { slice }

      # Two or more OR-ed terms arrive as a sequence; flag each one.
      rule(or_terms: sequence(:slices)) do
        slices.map do |slice|
          { term: { word: slice, or: true } }
        end
      end

      # A lone term arrives as a simple value; wrap it in a one-element list
      # (without the :or flag) so a single hash looks like a list.
      rule(or_terms: simple(:lone)) do
        [{ term: { word: lone } }]
      end

      # subtree matches the whole :query value: collect each clause's term list,
      # flatten them into one list, then re-wrap every term in a :clause hash.
      rule(query: subtree(:clauses)) do
        all_terms = clauses.map { |entry| entry[:clause] }.flatten
        { query: all_terms.map { |term| { clause: term } } }
      end
    end
    
    # Parse the query, run the resulting tree through MyTransform, and pretty-print it.
    pp MyTransform.new.apply(Parser.parse_tree_for('foo bar OR baz'))
    

    This example outputs:

    {:query=>
      [{:clause=>{:term=>{:word=>"foo"@0}}},
       {:clause=>{:term=>{:word=>"bar"@4, :or=>true}}},
       {:clause=>{:term=>{:word=>"baz"@11, :or=>true}}}]}
    

    I'm using the fact that all expressions are or_terms, and catching the case where there is only a single term so that `or` is not set to true for it. I'm also using the or_terms match to make a single term act like a collection too, so all clauses map to a list. Then, when matching the subtree, I can flatten the list to get all the terms and wrap them in 'clause' hashes again... Yuk! ;)