haskell parsec

How to extract comments from String using Parsec?


I'm trying to parse just comments from a String and I'm close but not quite there.

import Text.ParserCombinators.Parsec

-- | Parse a @//@ comment and return its text without the leading slashes.
--
-- The comment body runs up to a mandatory newline, which is consumed but not
-- returned; any whitespace following that newline is skipped as well.
parseSingleLineComment :: Parser String
parseSingleLineComment =
    string "//" *> manyTill anyChar newline <* spaces
-- | Parse a @/* ... */@ comment and return the text between the delimiters.
--
-- Whitespace following the closing @*/@ is skipped.
parseMultilineComment :: Parser String
parseMultilineComment =
    string "/*" *> manyTill anyChar (string "*/") <* spaces
-- | Succeed only at end of input, yielding an empty string.
--
-- The empty 'String' result gives this parser the same type as the comment
-- parsers, so it can be offered as an alternative to them with '<|>'.
parseEndOfFile :: Parser String
parseEndOfFile = "" <$ eof

-- | Parse either kind of comment.
--
-- Each alternative is wrapped in 'try', so a failed attempt at one kind
-- consumes no input.
parseComment :: Parser String
parseComment = choice [try parseSingleLineComment, try parseMultilineComment]
    
-- | Consume characters up to, but not including, the next comment or the end
-- of input, and return them.
parseNotComment :: Parser String
parseNotComment = manyTill anyChar stopHere
  where
    -- Peek for a comment (or end of input) without consuming it on success.
    stopHere = lookAhead (try parseComment <|> parseEndOfFile)

-- | Collect the text of every comment in the input.
--
-- Leading non-comment text is skipped first.  Comments are then read with
-- 'parseNotComment' acting as the separator, and the parser insists on
-- reaching end of input.  'sepEndBy1' demands at least one comment.
extractComments :: Parser [String]
extractComments =
  manyTill anyChar (lookAhead (parseComment <|> parseEndOfFile))
    *> try (sepEndBy1 parseComment parseNotComment)
    <* eof


-- | Print an input string, then the result of running 'extractComments' on
-- it, then a separator line.
printHelperF :: String -> IO ()
printHelperF input =
  print input
    >> print (parse extractComments "Test Parser" input)
    >> print "-------------------"

-- | Entry point: run the comment extractor over each sample input and print
-- the outcome.
main :: IO ()
main = mapM_ printHelperF [sample0, sample1, sample2, sample3, sample4]
  where
    -- No comments at all.
    sample0 = "No comments here"
    -- Two single-line comments; the last one has no trailing newline.
    sample1 = "//Hello there!\n//General Kenobi"
    -- One multiline comment spanning several lines.
    sample2 = "/* What's the deal with airline food?\nIt keeps getting worse and worse\nI can't take it anymore!*/"
    -- Code mixed with both kinds of comment.
    sample3 = " //Global Variable\nlet x = 5;\n/*TODO:\n\t// Add the number of cats as a variable\n\t//Shouldn't take too long\n*/\nlet c = 500;"
    -- A "//" appearing inside a single-line comment.
    sample4 = "//First\n//Second//NotThird\n//Third"


-- > runhaskell test.hs
-- "No comments here"
-- Left "Test Parser" (line 1, column 17):
-- unexpected end of input
-- expecting "//" or "/*" <---------- fails because no comment in string
-- "-------------------"
-- "//Hello there!\n//General Kenobi"
-- Right ["Hello there!"] <---------- fails to extract the last comment
-- "-------------------"
-- "/* What's the deal with airline food?\nIt keeps getting worse and worse\nI can't take it anymore!*/"
-- Right [" What's the deal with airline food?\nIt keeps getting worse and worse\nI can't take it anymore!"] <- correct
-- "-------------------"
-- " //Global Variable\nlet x = 5;\n/*TODO:\n\t// Add the number of cats as a variable\n\t//Shouldn't take too long\n*/\nlet c = 500;"
-- Right ["Global Variable","TODO:\n\t// Add the number of cats as a variable\n\t//Shouldn't take too long\n"] <- correct
-- "-------------------"
-- "//First\n//Second//NotThird\n//Third"
-- Right ["First","Second//NotThird"] <- again fails to extract the last comment
-- "-------------------"

Solution

  • If you replace sepEndBy1 with sepEndBy, that should take care of the problem with the "no comments" case failing.

    To handle the case of a final single-line comment with no terminating newline, try using:

    -- | Parse a @//@ comment, returning the text up to (but not including)
    -- the end of the line.  No newline is required, so a comment that runs
    -- to the end of the input is accepted too.
    parseSingleLineComment :: Parser String
    parseSingleLineComment = do
        string "//"
        -- 'many' collects the rest of the line; a bare 'noneOf' would yield a
        -- single Char and not type-check against 'Parser String'.
        many (noneOf "\n")
    

    After making these changes, there are several other test cases you should consider. An asterisk inside a multiline comment (such as the one in 3*5 below) causes the whole comment to be ignored.

    λ> printHelperF "x = 3*4 /* not 3*5 */"
    "x = 3*4 /* not 3*5 */"
    Right []
    "-------------------"
    

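    To fix this, wrap the terminator in try, so that a partial match of "*/" (a * that is not followed by /) backtracks instead of failing after consuming input: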

    -- | Parse a @/* ... */@ comment and return the text between the
    -- delimiters.
    --
    -- The terminator is wrapped in 'try' so that a @*@ which does not start
    -- @*/@ is backtracked over rather than aborting the scan.
    parseMultilineComment :: Parser String
    parseMultilineComment =
        string "/*" *> manyTill anyChar (try (string "*/"))
    

    Also, unterminated multiline comments are treated as code:

    > printHelperF "/* unterminated comment"
    "/* unterminated comment"
    Right []
    "-------------------"
    

    This should probably be a parse error instead. Fixing this involves moving around some try logic. Take the try calls out of parseComment:

    -- | Parse either kind of comment.  No 'try' is applied here; any
    -- backtracking is the responsibility of the two alternatives.
    parseComment :: Parser String
    parseComment = choice [parseSingleLineComment, parseMultilineComment]
    

    and move them into the sub-functions:

    -- | Parse a @//@ comment and return the rest of the line (the newline
    -- itself is left unconsumed).
    --
    -- Only the opening @//@ is under 'try', so a lone @/@ fails without
    -- consuming input.
    parseSingleLineComment :: Parser String
    parseSingleLineComment =
        try (string "//") *> many (noneOf "\n")
    
    -- | Parse a @/* ... */@ comment and return the text between the
    -- delimiters.
    --
    -- The opening @/*@ is under 'try', so a lone @/@ fails without consuming
    -- input; once @/*@ has matched, a missing @*/@ fails after consuming
    -- input.
    parseMultilineComment :: Parser String
    parseMultilineComment =
        try (string "/*") *> manyTill anyChar (try (string "*/"))
    

    The way this version of parseMultilineComment works is that a lone / character will cause the first parser to fail, but the try will ensure that no input is consumed (i.e., no comment was found). On the other hand, if string "/*" succeeds, then manyTill will search for the terminating string "*/". If it isn't found, the parser will fail, but only after consuming input (namely, the string "/*"). This will result in a parse error instead.

    For this to work correctly, we need to get rid of the try in parseNotComment:

    -- | Consume characters up to, but not including, the next comment or the
    -- end of input, and return them.
    parseNotComment :: Parser String
    parseNotComment = manyTill anyChar stopHere
      where
        -- Peek for a comment (or end of input); deliberately not wrapped in
        -- 'try', so a failure inside a started comment is not hidden.
        stopHere = lookAhead (parseComment <|> parseEndOfFile)
    

    and we can also simplify extractComments, since its first line is now identical to parseNotComment, and the other try is redundant:

    -- | Collect the text of every comment in the input (possibly none).
    --
    -- Leading non-comment text is skipped, comments are then read with
    -- 'parseNotComment' as the separator, and the input must be fully
    -- consumed.
    extractComments :: Parser [String]
    extractComments =
      parseNotComment
        *> sepEndBy parseComment parseNotComment
        <* eof
    

    The final result should pass your tests, plus a few more:

    module Comments where
    
    import Text.ParserCombinators.Parsec
    
    -- | Parse a @//@ comment and return the rest of the line (the newline
    -- itself is left unconsumed).
    --
    -- Only the opening @//@ is under 'try', so a lone @/@ fails without
    -- consuming input.
    parseSingleLineComment :: Parser String
    parseSingleLineComment =
        try (string "//") *> many (noneOf "\n")
    
    -- | Parse a @/* ... */@ comment and return the text between the
    -- delimiters.
    --
    -- The opening @/*@ is under 'try', so a lone @/@ fails without consuming
    -- input; once @/*@ has matched, a missing @*/@ fails after consuming
    -- input.
    parseMultilineComment :: Parser String
    parseMultilineComment =
        try (string "/*") *> manyTill anyChar (try (string "*/"))
    
    -- | Succeed only at end of input, yielding an empty string.
    --
    -- The empty 'String' result gives this parser the same type as the
    -- comment parsers, so it can be offered as an alternative to them with
    -- '<|>'.
    parseEndOfFile :: Parser String
    parseEndOfFile = "" <$ eof
    
    -- | Parse either kind of comment.  No 'try' is applied here; any
    -- backtracking is the responsibility of the two alternatives.
    parseComment :: Parser String
    parseComment = choice [parseSingleLineComment, parseMultilineComment]
    
    -- | Consume characters up to, but not including, the next comment or the
    -- end of input, and return them.
    parseNotComment :: Parser String
    parseNotComment = manyTill anyChar stopHere
      where
        -- Peek for a comment (or end of input); deliberately not wrapped in
        -- 'try', so a failure inside a started comment is not hidden.
        stopHere = lookAhead (parseComment <|> parseEndOfFile)
    
    -- | Collect the text of every comment in the input (possibly none).
    --
    -- Leading non-comment text is skipped, comments are then read with
    -- 'parseNotComment' as the separator, and the input must be fully
    -- consumed.
    extractComments :: Parser [String]
    extractComments =
      parseNotComment
        *> sepEndBy parseComment parseNotComment
        <* eof
    
    
    -- | Print an input string, then the result of running 'extractComments'
    -- on it, then a separator line.
    printHelperF :: String -> IO ()
    printHelperF input =
      print input
        >> print (parse extractComments "Test Parser" input)
        >> print "-------------------"
    
    -- | Entry point: run the comment extractor over every sample input and
    -- print each result.
    main :: IO ()
    main = do
      let sample0 = "No comments here"
      let sample1 = "//Hello there!\n//General Kenobi"
      let sample2 = "/* What's the deal with airline food?\nIt keeps getting worse and worse\nI can't take it anymore!*/"
      let sample3 = " //Global Variable\nlet x = 5;\n/*TODO:\n\t// Add the number of cats as a variable\n\t//Shouldn't take too long\n*/\nlet c = 500;"
      let sample4 = "//First\n//Second//NotThird\n//Third"
      -- Asterisks both outside and inside a multiline comment.
      let sample5 = "x = 3*4 /* not 3*5 */"
      -- A complete comment followed by an unterminated one; this input is
      -- expected to produce a parse error.
      let sample6 = "/* foo */ /* unterminated comment"
      -- Empty input.
      let sample7 = ""
      let samples = [sample0, sample1, sample2, sample3, sample4, sample5, sample6, sample7]
      mapM_ printHelperF samples
    

    giving output:

    "No comments here"
    Right []
    "-------------------"
    "//Hello there!\n//General Kenobi"
    Right ["Hello there!","General Kenobi"]
    "-------------------"
    "/* What's the deal with airline food?\nIt keeps getting worse and worse\nI can't take it anymore!*/"
    Right [" What's the deal with airline food?\nIt keeps getting worse and worse\nI can't take it anymore!"]
    "-------------------"
    " //Global Variable\nlet x = 5;\n/*TODO:\n\t// Add the number of cats as a variable\n\t//Shouldn't take too long\n*/\nlet c = 500;"
    Right ["Global Variable","TODO:\n\t// Add the number of cats as a variable\n\t//Shouldn't take too long\n"]
    "-------------------"
    "//First\n//Second//NotThird\n//Third"
    Right ["First","Second//NotThird","Third"]
    "-------------------"
    "x = 3*4 /* not 3*5 */"
    Right [" not 3*5 "]
    "-------------------"
    "/* foo */ /* unterminated comment"
    Left "Test Parser" (line 1, column 34):
    unexpected end of input
    expecting "*/"
    "-------------------"
    ""
    Right []
    "-------------------"