parsinghaskellparsec

Handling infix operator


I'm trying to parse a simple language defined as follows:

import Data.Functor.Identity
import Data.Text (Text)
import qualified Data.Text as T
import Text.Parsec
import qualified Text.Parsec.Expr as Expr

data G
  = Low Int
  | Up Int
  | And G G
  | Or G G
  deriving stock (Eq, Show)

parseIt :: Text -> Either ParseError G
parseIt = parse defP "parseIt"

type Parser = Parsec Text ()

defP :: Parser G
defP = goP <* eof
  where
    goP :: Parser G
    goP = Expr.buildExpressionParser table term
    table :: Expr.OperatorTable Text () Identity G
    table = [[binary And "&&", binary Or "||"]]
    term :: Parser G
    term =
      choice
        [ parens goP,
          unary Up ">",
          unary Low "<"
        ]
    binary :: (G -> G -> G) -> String -> Expr.Operator Text () Identity G
    binary func operator = Expr.Infix (string operator >> return func) Expr.AssocLeft
    unary :: (Int -> G) -> String -> Parser G
    unary mkSpec op = do
      void $ string op
      skipSpaces
      mkSpec <$> numP
    parens = between (symbol "(") (symbol ")")
      where
        symbol name = lexeme (string name)
        lexeme p = do x <- p; skipSpaces; return x
    skipSpaces = skipMany space
    numP :: Parser Int
    numP = do
      xs <- many1 digit
      return $ read xs

I have few test cases to exercise it:

import Control.Monad
import Test.Hspec

main :: IO ()
main = hspec spec

spec :: Spec
spec = do
  describe "should be parsed" $ do
    forM_
      [ (">1", Up 1),
        ("< 42", Low 42),
        (">1 && <42", Up 1 `And` Low 42),
        (">1 || <2 && >5", Up 1 `Or` (Low 2 `And` Up 5)),
        ("((>1 || <2)) && >5", (Up 1 `Or` Low 2) `And` Up 5)
      ]
      $ \(raw, expected :: G) ->
        it (T.unpack raw) $ parseIt raw `shouldBe` Right expected

But they fail on binary operators:

should be parsed
  >1 [✔]
  < 42 [✔]
  >1 && <42 [✘]
  >1 || <2 && >5 [✘]
  ((>1 || <2)) && >5 [✘]

Failures:

  test/Spec.hs:29:43: 
  1) parseVersionSpec, should be parsed, >1 && <42
       expected: Right (And (Up 1) (Low 42))
        but got: Left "parseIt" (line 1, column 3):
                 unexpected ' '
                 expecting digit, operator or end of input

  To rerun use: --match "/parseVersionSpec/should be parsed/>1 && <42/"

  test/Spec.hs:29:43: 
  2) parseVersionSpec, should be parsed, >1 || <2 && >5
       expected: Right (Or (Up 1) (And (Low 2) (Up 5)))
        but got: Left "parseIt" (line 1, column 3):
                 unexpected ' '
                 expecting digit, operator or end of input

  To rerun use: --match "/parseVersionSpec/should be parsed/>1 || <2 && >5/"

  test/Spec.hs:29:43: 
  3) parseVersionSpec, should be parsed, ((>1 || <2)) && >5
       expected: Right (And (Or (Up 1) (Low 2)) (Up 5))
        but got: Left "parseIt" (line 1, column 5):
                 unexpected " "
                 expecting digit, operator or ")"

  To rerun use: --match "/parseVersionSpec/should be parsed/((>1 || <2)) && >5/"

Randomized with seed 1024517159

Finished in 0.0016 seconds
5 examples, 3 failures
*** Exception: ExitFailure 1

I cannot find proper examples, any help would be appreciated.


Solution

  • The problem is that, in parsing ">1 && <42", term parses ">1", but leaves a space at the beginning of the remainder of the input stream " && <42", which causes binary to fail.

    To properly handle whitespace, you should write a set of lexemes that each expect to start parsing at non-whitespace and take responsibility for absorbing any trailing whitespace when finished, and then write the rest of your parser in terms of these lexemes only, without using non-lexeme parsers like string.

    Move your lexeme and symbol definitions up to top-level, or at least the level of defP's where clause:

    skipSpaces = skipMany space
    lexeme p = do x <- p; skipSpaces; return x
    symbol name = lexeme (string name)
    

    Define numP as a lexeme:

    numP :: Parser Int
    numP = lexeme $ do
      xs <- many1 digit
      return $ read xs
    

    and in the rest of your parsers, make use only of the lexeme-level parsers numP and symbol.

    For example, replace string/skipSpaces in unary with symbol:

    unary mkSpec op = do
      void $ symbol op
      mkSpec <$> numP
    

    This is a valid lexeme parser, because it parses the lexeme symbol op followed by the lexeme numP. Do the same in binary:

    binary :: (G -> G -> G) -> String -> Expr.Operator Text () Identity G
    binary func operator = Expr.Infix (symbol operator >> return func) Expr.AssocLeft
    

    Also, in your top-most parser defP, allow leading whitespace:

    defP :: Parser G
    defP = skipSpaces *> goP <* eof
    

    Finally, if you actually want && to have higher precedence than ||, you need to replace:

    table = [[binary And "&&", binary Or "||"]]
    

    with:

    table = [[binary And "&&"], [binary Or "||"]]
    

    The resulting parser should pass all your tests:

    defP :: Parser G
    defP = skipSpaces *> goP <* eof
    
      where
    
        goP :: Parser G
        goP = Expr.buildExpressionParser table term
    
        table :: Expr.OperatorTable Text () Identity G
        table = [[binary And "&&"], [binary Or "||"]]
    
        term :: Parser G
        term =
          choice
            [ parens goP,
              unary Up ">",
              unary Low "<"
            ]
    
        binary :: (G -> G -> G) -> String -> Expr.Operator Text () Identity G
        binary func operator = Expr.Infix (symbol operator >> return func) Expr.AssocLeft
    
        unary :: (Int -> G) -> String -> Parser G
        unary mkSpec op = do
          void $ symbol op
          mkSpec <$> numP
    
        parens = between (symbol "(") (symbol ")")
    
        skipSpaces = skipMany space
        lexeme p = do x <- p; skipSpaces; return x
        symbol name = lexeme (string name)
    
        numP :: Parser Int
        numP = lexeme $ do
          xs <- many1 digit
          return $ read xs
    

    Stylistically, you may also find that switching everything to consistent applicative style makes for a nicer looking parser. Given everything's in a where clause, I might also argue that dropping most of the type signatures would be better. They don't do much for readability:

    defP' :: Parser G
    defP' = skipSpaces *> goP <* eof
    
      where
    
        goP :: Parser G
        goP = Expr.buildExpressionParser table term
          where
            table = [[binary And "&&"], [binary Or "||"]]
            binary func operator = Expr.Infix (func <$ symbol operator) Expr.AssocLeft
    
        term = parens goP <|> unary Up ">" <|> unary Low "<"
          where unary mkSpec op = mkSpec <$ symbol op <*> numP
    
        parens = between (symbol "(") (symbol ")")
    
        numP :: Parser Int
        numP = lexeme (read <$> many1 digit)
    
        skipSpaces = skipMany space
        lexeme p = p <* skipSpaces
        symbol name = lexeme (string name)