antlr4

Antlr4 PLSQL Get Token Types


I'm using the following Python code to print the tokens in a PLSQL source file.

from antlr4 import *
from antlr4.tree.Tree import TerminalNodeImpl
from antlr4.tree.Trees import Trees
from PlSqlLexer import PlSqlLexer
from PlSqlParser import PlSqlParser

import sys
import json


def main():
    """Parse the PL/SQL file named on the command line and print its parse tree.

    The path is taken from ``sys.argv[1]``. The file is read in full, run
    through the generated ``PlSqlLexer``/``PlSqlParser`` starting at the
    ``sql_script`` rule, and the resulting tree is handed to ``traverse``.
    """
    source_path = sys.argv[1]
    with open(source_path, 'r') as handle:
        plsql_text = handle.read()

    char_stream = InputStream(plsql_text)
    token_stream = CommonTokenStream(PlSqlLexer(char_stream))
    parser = PlSqlParser(token_stream)
    # The rule-name table lets traverse() label each non-terminal node.
    traverse(parser.sql_script(), parser.ruleNames)

def traverse(tree, rule_names, indent = 0):
    """Print an indented outline of a parse tree, one node per line.

    Terminal nodes are printed as ``TOKEN='<text>'``; every other node is
    printed as the name of its grammar rule and its children are printed one
    level deeper. Any node whose text is exactly ``<EOF>`` is skipped.

    Args:
        tree: Parse-tree node to print (terminal node or rule context).
        rule_names: Sequence indexed by rule index (``parser.ruleNames``).
        indent: Current nesting depth; each level adds two spaces.
    """
    if tree.getText() == "<EOF>":
        return

    prefix = "  " * indent
    if isinstance(tree, TerminalNodeImpl):
        print("{0}TOKEN='{1}'".format(prefix, tree.getText()))
        return

    print("{0}{1}".format(prefix, rule_names[tree.getRuleIndex()]))
    # The ANTLR Python runtime leaves `children` as None for a rule context
    # that matched no input, so fall back to an empty tuple instead of
    # raising TypeError on iteration.
    for child in tree.children or ():
        traverse(child, rule_names, indent + 1)

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

When run with a PL/SQL source file, it prints output like this:

  TOKEN='CREATE'
  TOKEN='OR'
  TOKEN='REPLACE'
  TOKEN='PACKAGE'
  TOKEN='BODY'
  TOKEN='pa_temp'
  TOKEN='AS'
  TOKEN='PROCEDURE'
  TOKEN='pr_new_item'
  TOKEN='('
  TOKEN='p_item'
  TOKEN='IN'
  TOKEN='items'
.
.
.

But I would also like to print what the token type is (procedure start, variable, table, etc.).

I have tried print(json.dumps(tree)) and print(json.dumps(parser)) to see if there is anything useful, but this just fails with errors like:

TypeError: Object of type PlSqlLexer is not JSON serializable


Solution

  • I have a small utility method that converts the parse tree ANTLR produces into a dict, which can then be converted into JSON. Let's say your grammar looks like this:

    // Small arithmetic-expression grammar used to demonstrate the tree-to-dict conversion.
    grammar Expr;
    
    // Entry rule: a single expression followed by end of input.
    parse
     : expr EOF
     ;
    
    // Each alternative carries a #label; ANTLR generates one context class per
    // label. Alternatives listed earlier take precedence when ANTLR 4 resolves
    // the left-recursive ambiguity, so unary minus binds tightest, then * and /,
    // then + and -.
    expr
     : MIN expr              #unaryExpr
     | expr (MUL | DIV) expr #mulExpr
     | expr (ADD | MIN) expr #addExpr
     | OPAR expr  CPAR       #nestedExpr
     | NUM                   #numExpr
     ;
    
    // Lexer rules: integer or decimal literals, operators and parentheses.
    NUM : [0-9]+ ('.' [0-9]+)?;
    MUL : '*';
    DIV : '/';
    ADD : '+';
    MIN : '-';
    OPAR : '(';
    CPAR : ')';
    // Whitespace is dropped by the lexer and never reaches the parser.
    SPACES : [ \t\r\n]+ -> skip;
    

    then the input 2 * (555 - -50) / 42 will be parsed as:

    [Image: parse tree for the input 2 * (555 - -50) / 42]

    You can use the following Python code to convert the parse tree to a dict:

    import antlr4
    import json
    from ExprLexer import ExprLexer
    from ExprParser import ExprParser
    
    
    def to_dict(root, rule_names):
        """Convert a parse tree into nested dicts and lists that ``json`` can dump.

        A token node becomes ``{'type': ..., 'name': ..., 'text': ...}``. Any
        other node becomes ``{<node name>: [<child dict>, ...]}``, where the
        node name is the node's class name with its first letter lower-cased
        and a trailing ``Context`` removed.

        Args:
            root: Root node of the parse tree.
            rule_names: Table indexed by token type that yields the token's
                symbolic name. Despite the parameter name, it is used to look
                up token names, not rule names.

        Returns:
            The dict describing ``root`` and everything below it.
        """
        dictionary = {}
        __traverse(root, dictionary, rule_names)
        return dictionary
    
    
    def __traverse(tree, dictionary, symbolic_lexer_names):
        """Fill ``dictionary`` in place with the description of ``tree``."""
        # Only a childless node can be a token, but a childless node is not
        # necessarily one: a rule that matched nothing has no children and no
        # `symbol`, so it must be reported as an (empty) rule node instead.
        token = getattr(tree, 'symbol', None) if tree.getChildCount() == 0 else None
        if token is not None:
            dictionary['type'] = token.type
            # Token type -1 is end of input and has no entry in the name table.
            dictionary['name'] = 'EOF' if token.type == -1 else symbolic_lexer_names[token.type]
            dictionary['text'] = token.text
            return
    
        class_name = type(tree).__name__
        name = class_name[0].lower() + class_name[1:]
        # Strip only the trailing 'Context' so a name that contains the word
        # elsewhere (e.g. 'MyContextExprContext') is not mangled.
        if name.endswith('Context'):
            name = name[:-len('Context')]
    
        children = []
        dictionary[name] = children
        for i in range(tree.getChildCount()):
            nested = {}
            children.append(nested)
            __traverse(tree.getChild(i), nested, symbolic_lexer_names)
    
    
    def main(source):
        """Parse ``source`` with the Expr grammar and print its parse tree as JSON.

        Args:
            source: The expression text to parse.
        """
        char_stream = antlr4.InputStream(source)
        lexer = ExprLexer(char_stream)
        token_stream = antlr4.CommonTokenStream(lexer)
        parse_tree = ExprParser(token_stream).parse()
        # The lexer's symbolic-name table maps token types to token names.
        as_dict = to_dict(parse_tree, lexer.symbolicNames)
        print(json.dumps(as_dict, indent=2))
    
    
    # Demo entry point: convert the sample expression when run as a script.
    if __name__ == '__main__':
        main('2 * (555 - -50) / 42')
    

    which will print:

    {
      "parse": [
        {
          "mulExpr": [
            {
              "mulExpr": [
                {
                  "numExpr": [
                    {
                      "type": 1,
                      "name": "NUM",
                      "text": "2"
                    }
                  ]
                },
                {
                  "type": 2,
                  "name": "MUL",
                  "text": "*"
                },
                {
                  "nestedExpr": [
                    {
                      "type": 6,
                      "name": "OPAR",
                      "text": "("
                    },
                    {
                      "addExpr": [
                        {
                          "numExpr": [
                            {
                              "type": 1,
                              "name": "NUM",
                              "text": "555"
                            }
                          ]
                        },
                        {
                          "type": 5,
                          "name": "MIN",
                          "text": "-"
                        },
                        {
                          "unaryExpr": [
                            {
                              "type": 5,
                              "name": "MIN",
                              "text": "-"
                            },
                            {
                              "numExpr": [
                                {
                                  "type": 1,
                                  "name": "NUM",
                                  "text": "50"
                                }
                              ]
                            }
                          ]
                        }
                      ]
                    },
                    {
                      "type": 7,
                      "name": "CPAR",
                      "text": ")"
                    }
                  ]
                }
              ]
            },
            {
              "type": 3,
              "name": "DIV",
              "text": "/"
            },
            {
              "numExpr": [
                {
                  "type": 1,
                  "name": "NUM",
                  "text": "42"
                }
              ]
            }
          ]
        },
        {
          "type": -1,
          "name": "EOF",
          "text": "<EOF>"
        }
      ]
    }