Tags: ruby, parsing, markdown, parslet

Parse markdown indented code block


I am trying to parse Markdown using a grammar written with Parslet. However, I cannot get past indented code blocks because everything I tried so far got stuck in recursion. They look like this:

    This is a indented code block.
    Second line.

    Code block continues after blank line.

    There can be any number of chunks, 
    separated by not more than one blank line.

In order to solve this, I wrote a minimal example which replaces each line (including its \n) with an a and each blank line (\n\n) with a space, e.g.: a aaa aa.

# recurring_group_parser.rb

require 'parslet'
require 'rspec'
require 'parslet/rig/rspec'

# Minimal stand-in grammar: runs of 'a' play the role of lines and a single
# space plays the role of the blank line between chunks.
class RecurringGroupParser < Parslet::Parser
  # A block is one to three chunks in a row.
  rule(:block) { chunk.repeat(1, 3) }

  # A chunk is one to three 'a' characters followed by a separator.
  rule(:chunk) { str('a').repeat(1, 3) >> space }

  # Separator: a literal space, or a zero-width match when no chunk follows.
  rule(:space) { str(' ') | chunk.absent? }

  root :block
end

describe RecurringGroupParser do
  # Inputs the grammar must accept: one, two and three chunks of one to three
  # 'a' characters each. Every entry becomes one example named
  # "should parse <input>". 'aa a' is listed twice, so it yields two examples.
  inputs = [
    # one chunk
    'a', 'aa', 'aaa',
    # two chunks
    'a a', 'aa a', 'aaa a',
    'a aa', 'a aaa',
    'aa a', 'aa aa', 'aa aaa',
    'aaa aa', 'aaa aaa',
    # three chunks, last chunk 'a'
    'a a a', 'aa a a', 'aaa a a',
    'a aa a', 'aa aa a', 'aaa aa a',
    'a aaa a', 'aa aaa a', 'aaa aaa a',
    # three chunks, last chunk 'aa'
    'a a aa', 'aa a aa', 'aaa a aa',
    'a aa aa', 'aa aa aa', 'aaa aa aa',
    'a aaa aa', 'aa aaa aa', 'aaa aaa aa',
    # three chunks, last chunk 'aaa'
    'a a aaa', 'aa a aaa', 'aaa a aaa',
    'a aa aaa', 'aa aa aaa', 'aaa aa aaa',
    'a aaa aaa', 'aa aaa aaa', 'aaa aaa aaa'
  ]

  inputs.each do |input|
    it "should parse #{input}" do
      is_expected.to parse input
    end
  end
end

Running rspec recurring_group_parser.rb works fine. Only when I put the newlines back in, it stalls:

# recurring_group_parser.rb

require 'parslet'
require 'rspec'
require 'parslet/rig/rspec'

# Variant of the grammar that works on real newlines: every line is a single
# 'a', and chunks are meant to be separated by a blank line.
class RecurringGroupParser < Parslet::Parser
  root(:block)

  # A block is one to three chunks in a row.
  rule :block do
    chunk.repeat(1,3)
  end

  # A chunk is one to three lines followed by a chunk separator.
  rule :chunk do
    line.repeat(1,3) >> blank_line
  end

  # A single 'a' terminated by :newline (a "\n" or end of input).
  rule :line do
    str('a') >> newline
  end

  # Chunk separator: at least two newlines, or a zero-width match when no
  # chunk follows.
  # NOTE(review): repeat(2) has no upper bound and :newline can succeed at end
  # of input without consuming anything, so this repetition can keep matching
  # at the same position forever -- this looks like the cause of the stall.
  # NOTE(review): :line has already consumed the "\n" that ends the previous
  # line, so a blank line leaves only one "\n" at this point -- confirm
  # whether requiring two here is intended.
  rule :blank_line do
    newline.repeat(2) | chunk.absent?
  end

  # A literal "\n", or a zero-width match at end of input.
  rule :newline do
    str("\n") | any.absent?
  end
end

describe RecurringGroupParser do
  # [label, input] pairs; each pair becomes one example named
  # "should parse <label>" that feeds <input> to the parser. In the label a
  # run of 'a' stands for a chunk and a space for the blank line between chunks.
  # 'aa a' is listed twice, so it yields two examples.
  # NOTE(review): several inputs contain a line "aa" (e.g. "a\naa\n\na" for
  # 'aaa a') where the label suggests three separate lines ("a\na\na"); they
  # are kept exactly as written -- confirm whether that is intended.
  cases = [
    ['a',           "a"],
    ['aa',          "a\na"],
    ['aaa',         "a\na\na"],
    ['a a',         "a\n\na"],
    ['aa a',        "a\na\n\na"],
    ['aaa a',       "a\naa\n\na"],
    ['a aa',        "a\n\na\na"],
    ['a aaa',       "a\n\na\na\na"],
    ['aa a',        "a\na\n\na"],
    ['aa aa',       "a\na\n\na\na"],
    ['aa aaa',      "a\na\n\na\na\na"],
    ['aaa aa',      "a\naa\n\na\na"],
    ['aaa aaa',     "a\naa\n\na\na\na"],
    ['a a a',       "a\n\na\n\na"],
    ['aa a a',      "a\na\n\na\n\na"],
    ['aaa a a',     "a\naa\n\na\n\na"],
    ['a aa a',      "a\n\na\na\n\na"],
    ['aa aa a',     "a\na\n\na\na\n\na"],
    ['aaa aa a',    "a\naa\n\na\na\n\na"],
    ['a aaa a',     "a\n\na\naa\n\na"],
    ['aa aaa a',    "a\na\n\na\naa\n\na"],
    ['aaa aaa a',   "a\naa\n\na\naa\n\na"],
    ['a a aa',      "a\n\na\n\na\na"],
    ['aa a aa',     "a\na\n\na\n\na\na"],
    ['aaa a aa',    "a\naa\n\na\n\na\na"],
    ['a aa aa',     "a\n\na\na\n\na\na"],
    ['aa aa aa',    "a\na\n\na\na\n\na\na"],
    ['aaa aa aa',   "a\naa\n\na\na\n\na\na"],
    ['a aaa aa',    "a\n\na\naa\n\na\na"],
    ['aa aaa aa',   "a\na\n\na\naa\n\na\na"],
    ['aaa aaa aa',  "a\naa\n\na\naa\n\na\na"],
    ['a a aaa',     "a\n\na\n\na\na\na"],
    ['aa a aaa',    "a\na\n\na\n\na\na\na"],
    ['aaa a aaa',   "a\naa\n\na\n\na\na\na"],
    ['a aa aaa',    "a\n\na\na\n\na\na\na"],
    ['aa aa aaa',   "a\na\n\na\na\n\na\na\na"],
    ['aaa aa aaa',  "a\naa\n\na\na\n\na\na\na"],
    ['a aaa aaa',   "a\n\na\naa\n\na\na\na"],
    ['aa aaa aaa',  "a\na\n\na\naa\n\na\na\na"],
    ['aaa aaa aaa', "a\naa\n\na\naa\n\na\na\na"]
  ]

  cases.each do |label, input|
    it "should parse #{label}" do
      is_expected.to parse input
    end
  end
end

To simplify this, lines can only consist of a single a and are not indented; that can easily be changed later and is not related to the failure to finish parsing. I am also pretty sure that there is a collision between chunk.absent? in rule :blank_line and any.absent? in rule :newline, but I have no idea how to fix this and provide criteria to break the recursion. Any help is welcome!


Solution

  • In this case newline can match end of input (eof), so newline.repeat(2) keeps matching eof over and over without ever consuming anything. You want "repeat(2,2)". You can make these bugs easy to find :)... Just use my fork.

    You can detect how it's looping by using my fork of parslet. It catches loops and tells you what's happening. It's slower than the usual parslet, so switch back for production parsing.

    Use this Gemfile:

    # Gemfile: takes parslet from the fork's Git repository; rspec comes from rubygems.org.
    source 'https://rubygems.org'

    gem 'parslet', git: 'https://github.com/NigelThorne/parslet.git'
    gem 'rspec'
    

    And you get these results:

     9:23:40.20 > bundle exec rspec parser.rb
    FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
    
    Failures:
    
      1) RecurringGroupParser should parse a
         Failure/Error: is_expected.to parse "a"
         RuntimeError:
           Grammar contains an infinite loop applying 'NEWLINE{2, }' at char position 1
           ...a<-- here
         # ./parser.rb:33:in `block (2 levels) in <top (required)>'
    
      2) RecurringGroupParser should parse aa
         Failure/Error: is_expected.to parse "a\na"
         RuntimeError:
           Grammar contains an infinite loop applying 'NEWLINE{2, }' at char position 3
           ...a
           a<-- here
         # ./parser.rb:37:in `block (2 levels) in <top (required)>'
    
      3) RecurringGroupParser should parse aaa
         Failure/Error: is_expected.to parse "a\na\na"
         RuntimeError:
           Grammar contains an infinite loop applying 'NEWLINE{2, }' at char position 5
           ...a
           a
           a<-- here
         # ./parser.rb:41:in `block (2 levels) in <top (required)>'
    
      4) RecurringGroupParser should parse a a
         Failure/Error: is_expected.to parse "a\n\na"
           expected BLOCK to be able to parse "a\n\na"
         # ./parser.rb:45:in `block (2 levels) in <top (required)>'
    
      5) RecurringGroupParser should parse aa a
         Failure/Error: is_expected.to parse "a\na\n\na"
           expected BLOCK to be able to parse "a\na\n\na"
         # ./parser.rb:49:in `block (2 levels) in <top (required)>'
    
      6) RecurringGroupParser should parse aaa a
         Failure/Error: is_expected.to parse "a\naa\n\na"
           expected BLOCK to be able to parse "a\naa\n\na"
         # ./parser.rb:53:in `block (2 levels) in <top (required)>'
    
      7) RecurringGroupParser should parse a aa
         Failure/Error: is_expected.to parse "a\n\na\na"
           expected BLOCK to be able to parse "a\n\na\na"
         # ./parser.rb:57:in `block (2 levels) in <top (required)>'
    
      8) RecurringGroupParser should parse a aaa
         Failure/Error: is_expected.to parse "a\n\na\na\na"
           expected BLOCK to be able to parse "a\n\na\na\na"
         # ./parser.rb:61:in `block (2 levels) in <top (required)>'
    
      9) RecurringGroupParser should parse aa a
         Failure/Error: is_expected.to parse "a\na\n\na"
           expected BLOCK to be able to parse "a\na\n\na"
         # ./parser.rb:65:in `block (2 levels) in <top (required)>'
    
      10) RecurringGroupParser should parse aa aa
         Failure/Error: is_expected.to parse "a\na\n\na\na"
           expected BLOCK to be able to parse "a\na\n\na\na"
         # ./parser.rb:69:in `block (2 levels) in <top (required)>'
    
      11) RecurringGroupParser should parse aa aaa
         Failure/Error: is_expected.to parse "a\na\n\na\na\na"
           expected BLOCK to be able to parse "a\na\n\na\na\na"
         # ./parser.rb:73:in `block (2 levels) in <top (required)>'
    
      12) RecurringGroupParser should parse aaa aa
         Failure/Error: is_expected.to parse "a\naa\n\na\na"
           expected BLOCK to be able to parse "a\naa\n\na\na"
         # ./parser.rb:77:in `block (2 levels) in <top (required)>'
    
      13) RecurringGroupParser should parse aaa aaa
         Failure/Error: is_expected.to parse "a\naa\n\na\na\na"
           expected BLOCK to be able to parse "a\naa\n\na\na\na"
         # ./parser.rb:81:in `block (2 levels) in <top (required)>'
    
      14) RecurringGroupParser should parse a a a
         Failure/Error: is_expected.to parse "a\n\na\n\na"
           expected BLOCK to be able to parse "a\n\na\n\na"
         # ./parser.rb:85:in `block (2 levels) in <top (required)>'
    
      15) RecurringGroupParser should parse aa a a
         Failure/Error: is_expected.to parse "a\na\n\na\n\na"
           expected BLOCK to be able to parse "a\na\n\na\n\na"
         # ./parser.rb:89:in `block (2 levels) in <top (required)>'
    
      16) RecurringGroupParser should parse aaa a a
         Failure/Error: is_expected.to parse "a\naa\n\na\n\na"
           expected BLOCK to be able to parse "a\naa\n\na\n\na"
         # ./parser.rb:93:in `block (2 levels) in <top (required)>'
    
      17) RecurringGroupParser should parse a aa a
         Failure/Error: is_expected.to parse "a\n\na\na\n\na"
           expected BLOCK to be able to parse "a\n\na\na\n\na"
         # ./parser.rb:97:in `block (2 levels) in <top (required)>'
    
      18) RecurringGroupParser should parse aa aa a
         Failure/Error: is_expected.to parse "a\na\n\na\na\n\na"
           expected BLOCK to be able to parse "a\na\n\na\na\n\na"
         # ./parser.rb:101:in `block (2 levels) in <top (required)>'
    
      19) RecurringGroupParser should parse aaa aa a
         Failure/Error: is_expected.to parse "a\naa\n\na\na\n\na"
           expected BLOCK to be able to parse "a\naa\n\na\na\n\na"
         # ./parser.rb:105:in `block (2 levels) in <top (required)>'
    
      20) RecurringGroupParser should parse a aaa a
         Failure/Error: is_expected.to parse "a\n\na\naa\n\na"
           expected BLOCK to be able to parse "a\n\na\naa\n\na"
         # ./parser.rb:109:in `block (2 levels) in <top (required)>'
    
      21) RecurringGroupParser should parse aa aaa a
         Failure/Error: is_expected.to parse "a\na\n\na\naa\n\na"
           expected BLOCK to be able to parse "a\na\n\na\naa\n\na"
         # ./parser.rb:113:in `block (2 levels) in <top (required)>'
    
      22) RecurringGroupParser should parse aaa aaa a
         Failure/Error: is_expected.to parse "a\naa\n\na\naa\n\na"
           expected BLOCK to be able to parse "a\naa\n\na\naa\n\na"
         # ./parser.rb:117:in `block (2 levels) in <top (required)>'
    
      23) RecurringGroupParser should parse a a aa
         Failure/Error: is_expected.to parse "a\n\na\n\na\na"
           expected BLOCK to be able to parse "a\n\na\n\na\na"
         # ./parser.rb:121:in `block (2 levels) in <top (required)>'
    
      24) RecurringGroupParser should parse aa a aa
         Failure/Error: is_expected.to parse "a\na\n\na\n\na\na"
           expected BLOCK to be able to parse "a\na\n\na\n\na\na"
         # ./parser.rb:125:in `block (2 levels) in <top (required)>'
    
      25) RecurringGroupParser should parse aaa a aa
         Failure/Error: is_expected.to parse "a\naa\n\na\n\na\na"
           expected BLOCK to be able to parse "a\naa\n\na\n\na\na"
         # ./parser.rb:129:in `block (2 levels) in <top (required)>'
    
      26) RecurringGroupParser should parse a aa aa
         Failure/Error: is_expected.to parse "a\n\na\na\n\na\na"
           expected BLOCK to be able to parse "a\n\na\na\n\na\na"
         # ./parser.rb:133:in `block (2 levels) in <top (required)>'
    
      27) RecurringGroupParser should parse aa aa aa
         Failure/Error: is_expected.to parse "a\na\n\na\na\n\na\na"
           expected BLOCK to be able to parse "a\na\n\na\na\n\na\na"
         # ./parser.rb:137:in `block (2 levels) in <top (required)>'
    
      28) RecurringGroupParser should parse aaa aa aa
         Failure/Error: is_expected.to parse "a\naa\n\na\na\n\na\na"
           expected BLOCK to be able to parse "a\naa\n\na\na\n\na\na"
         # ./parser.rb:141:in `block (2 levels) in <top (required)>'
    
      29) RecurringGroupParser should parse a aaa aa
         Failure/Error: is_expected.to parse "a\n\na\naa\n\na\na"
           expected BLOCK to be able to parse "a\n\na\naa\n\na\na"
         # ./parser.rb:145:in `block (2 levels) in <top (required)>'
    
      30) RecurringGroupParser should parse aa aaa aa
         Failure/Error: is_expected.to parse "a\na\n\na\naa\n\na\na"
           expected BLOCK to be able to parse "a\na\n\na\naa\n\na\na"
         # ./parser.rb:149:in `block (2 levels) in <top (required)>'
    
      31) RecurringGroupParser should parse aaa aaa aa
         Failure/Error: is_expected.to parse "a\naa\n\na\naa\n\na\na"
           expected BLOCK to be able to parse "a\naa\n\na\naa\n\na\na"
         # ./parser.rb:153:in `block (2 levels) in <top (required)>'
    
      32) RecurringGroupParser should parse a a aaa
         Failure/Error: is_expected.to parse "a\n\na\n\na\na\na"
           expected BLOCK to be able to parse "a\n\na\n\na\na\na"
         # ./parser.rb:157:in `block (2 levels) in <top (required)>'
    
      33) RecurringGroupParser should parse aa a aaa
         Failure/Error: is_expected.to parse "a\na\n\na\n\na\na\na"
           expected BLOCK to be able to parse "a\na\n\na\n\na\na\na"
         # ./parser.rb:161:in `block (2 levels) in <top (required)>'
    
      34) RecurringGroupParser should parse aaa a aaa
         Failure/Error: is_expected.to parse "a\naa\n\na\n\na\na\na"
           expected BLOCK to be able to parse "a\naa\n\na\n\na\na\na"
         # ./parser.rb:165:in `block (2 levels) in <top (required)>'
    
      35) RecurringGroupParser should parse a aa aaa
         Failure/Error: is_expected.to parse "a\n\na\na\n\na\na\na"
           expected BLOCK to be able to parse "a\n\na\na\n\na\na\na"
         # ./parser.rb:169:in `block (2 levels) in <top (required)>'
    
      36) RecurringGroupParser should parse aa aa aaa
         Failure/Error: is_expected.to parse "a\na\n\na\na\n\na\na\na"
           expected BLOCK to be able to parse "a\na\n\na\na\n\na\na\na"
         # ./parser.rb:173:in `block (2 levels) in <top (required)>'
    
      37) RecurringGroupParser should parse aaa aa aaa
         Failure/Error: is_expected.to parse "a\naa\n\na\na\n\na\na\na"
           expected BLOCK to be able to parse "a\naa\n\na\na\n\na\na\na"
         # ./parser.rb:177:in `block (2 levels) in <top (required)>'
    
      38) RecurringGroupParser should parse a aaa aaa
         Failure/Error: is_expected.to parse "a\n\na\naa\n\na\na\na"
           expected BLOCK to be able to parse "a\n\na\naa\n\na\na\na"
         # ./parser.rb:181:in `block (2 levels) in <top (required)>'
    
      39) RecurringGroupParser should parse aa aaa aaa
         Failure/Error: is_expected.to parse "a\na\n\na\naa\n\na\na\na"
           expected BLOCK to be able to parse "a\na\n\na\naa\n\na\na\na"
         # ./parser.rb:185:in `block (2 levels) in <top (required)>'
    
      40) RecurringGroupParser should parse aaa aaa aaa
         Failure/Error: is_expected.to parse "a\naa\n\na\naa\n\na\na\na"
           expected BLOCK to be able to parse "a\naa\n\na\naa\n\na\na\na"
         # ./parser.rb:189:in `block (2 levels) in <top (required)>'
    
    Finished in 0.01702 seconds (files took 0.26725 seconds to load)
    40 examples, 40 failures    
    

    See this question on parsing indentation with Parslet.