I'm using the following Python code to print the tokens in a PLSQL source file.
from antlr4 import *
from antlr4.tree.Tree import TerminalNodeImpl
from antlr4.tree.Trees import Trees
from PlSqlLexer import PlSqlLexer
from PlSqlParser import PlSqlParser
import sys
import json
def main():
    """Parse the PL/SQL file named by the first command-line argument.

    The file is read in full, run through the generated lexer and parser
    starting at the ``sql_script`` rule, and the resulting parse tree is
    printed via ``traverse``.
    """
    source_path = sys.argv[1]
    with open(source_path, 'r') as handle:
        source_text = handle.read()

    char_stream = InputStream(source_text)
    token_stream = CommonTokenStream(PlSqlLexer(char_stream))
    parser = PlSqlParser(token_stream)

    # Parse first, then hand the tree and the parser's rule-name table over.
    parse_tree = parser.sql_script()
    traverse(parse_tree, parser.ruleNames)
def traverse(tree, rule_names, indent = 0):
    """Recursively print a parse tree, one node per line.

    Terminal nodes are printed as ``TOKEN='<text>'``; every other node is
    printed as the name of its grammar rule. Each nesting level adds one
    space of indentation. Any node whose text is exactly ``"<EOF>"`` is
    skipped, together with everything below it.

    Args:
        tree: Parse-tree node. Either a ``TerminalNodeImpl`` or a rule
            context providing ``getRuleIndex()`` and ``children``.
        rule_names: Sequence indexed by rule index that yields the rule
            name (for example ``parser.ruleNames``).
        indent: Current nesting depth, used as the number of leading spaces.
    """
    if tree.getText() == "<EOF>":
        return

    padding = " " * indent
    if isinstance(tree, TerminalNodeImpl):
        print("{0}TOKEN='{1}'".format(padding, tree.getText()))
        return

    print("{0}{1}".format(padding, rule_names[tree.getRuleIndex()]))
    # The ANTLR Python runtime leaves `children` as None on a rule context
    # that matched no tokens; fall back to an empty tuple so such nodes are
    # printed without raising TypeError.
    for child in tree.children or ():
        traverse(child, rule_names, indent + 1)
# Run the tree printer only when this file is executed as a script.
if __name__ == '__main__':
    main()
When run against a PL/SQL source file, it produces output like this:
TOKEN='CREATE'
TOKEN='OR'
TOKEN='REPLACE'
TOKEN='PACKAGE'
TOKEN='BODY'
TOKEN='pa_temp'
TOKEN='AS'
TOKEN='PROCEDURE'
TOKEN='pr_new_item'
TOKEN='('
TOKEN='p_item'
TOKEN='IN'
TOKEN='items'
.
.
.
But I would also like to print what the token type is (procedure start, variable, table, etc.).
I have tried print(json.dumps(tree))
and print(json.dumps(parser))
to see if there is anything useful, but this just raises an error like:
TypeError: Object of type PlSqlLexer is not JSON serializable
I have a small utility method that converts the parse tree ANTLR produces into a dict, which can then be converted into JSON. Let's say your grammar looks like this:
// Small arithmetic-expression grammar used by the Python example below.
grammar Expr;
// Entry rule: a single expression followed by end of input.
parse
: expr EOF
;
// Left-recursive expression rule. Each alternative carries a `#label`;
// those labels are the names that appear as keys in the JSON output
// produced by the Python code further down (e.g. "mulExpr", "numExpr").
// In ANTLR 4, alternatives listed earlier take precedence over later ones.
expr
: MIN expr #unaryExpr
| expr (MUL | DIV) expr #mulExpr
| expr (ADD | MIN) expr #addExpr
| OPAR expr CPAR #nestedExpr
| NUM #numExpr
;
// Lexer rules. NUM is an integer with an optional fractional part.
NUM : [0-9]+ ('.' [0-9]+)?;
MUL : '*';
DIV : '/';
ADD : '+';
// MIN is shared by binary subtraction and unary negation.
MIN : '-';
OPAR : '(';
CPAR : ')';
// Whitespace is discarded and never reaches the parser.
SPACES : [ \t\r\n]+ -> skip;
then the input 2 * (555 - -50) / 42
will be parsed as:
You can use the following Python code to convert the parse tree to a dict:
import antlr4
import json
from ExprLexer import ExprLexer
from ExprParser import ExprParser
def to_dict(root, rule_names):
    """Return a nested, JSON-serialisable dict describing the tree at ``root``.

    ``rule_names`` is passed through unchanged to ``__traverse``, which
    indexes it by token type to obtain the name of each leaf token.
    """
    result = {}
    __traverse(root, result, rule_names)
    return result
def __traverse(tree, dictionary, symbolic_lexer_names):
    """Fill ``dictionary`` in place with a description of ``tree``.

    A token node (one that carries a ``symbol``) contributes three keys:
    ``type`` (the numeric token type), ``name`` (looked up in
    ``symbolic_lexer_names``, or ``'EOF'`` for type -1) and ``text``.

    Any other node contributes a single key, derived from its class name
    with a trailing ``Context`` removed and the first letter lower-cased,
    whose value is a list holding one dict per child, filled recursively.
    A node without children yields an empty list.

    Args:
        tree: Parse-tree node providing ``getChildCount()`` and
            ``getChild(i)``; token nodes additionally expose ``symbol``
            with ``type`` and ``text`` attributes.
        dictionary: Dict to populate; mutated in place.
        symbolic_lexer_names: Sequence indexed by token type that yields
            the token's symbolic name.
    """
    # Decide "is this a token?" by the presence of `symbol` rather than by
    # a child count of zero: a rule that matched nothing also has zero
    # children but has no `symbol`, which used to raise AttributeError.
    token = getattr(tree, 'symbol', None)
    if token is not None:
        dictionary['type'] = token.type
        dictionary['name'] = 'EOF' if token.type == -1 else symbolic_lexer_names[token.type]
        dictionary['text'] = token.text
        return

    suffix = 'Context'
    class_name = type(tree).__name__
    # Strip only the trailing "Context"; a blanket replace() would also eat
    # the word when it occurs inside the rule name itself.
    if class_name.endswith(suffix) and len(class_name) > len(suffix):
        class_name = class_name[:-len(suffix)]
    name = class_name[:1].lower() + class_name[1:]

    children = dictionary[name] = []
    for index in range(tree.getChildCount()):
        nested = {}
        children.append(nested)
        __traverse(tree.getChild(index), nested, symbolic_lexer_names)
def main(source):
    """Parse ``source`` with the Expr grammar and print the tree as JSON.

    The parse starts at the ``parse`` rule; the tree is converted with
    ``to_dict`` using the lexer's symbolic token names and printed with a
    two-space indent.
    """
    char_stream = antlr4.InputStream(source)
    lexer = ExprLexer(char_stream)
    token_stream = antlr4.CommonTokenStream(lexer)
    parse_tree = ExprParser(token_stream).parse()
    as_dict = to_dict(parse_tree, lexer.symbolicNames)
    print(json.dumps(as_dict, indent=2))
# Demo entry point: convert a sample expression when run as a script.
if __name__ == '__main__':
    main('2 * (555 - -50) / 42')
which will print:
{
"parse": [
{
"mulExpr": [
{
"mulExpr": [
{
"numExpr": [
{
"type": 1,
"name": "NUM",
"text": "2"
}
]
},
{
"type": 2,
"name": "MUL",
"text": "*"
},
{
"nestedExpr": [
{
"type": 6,
"name": "OPAR",
"text": "("
},
{
"addExpr": [
{
"numExpr": [
{
"type": 1,
"name": "NUM",
"text": "555"
}
]
},
{
"type": 5,
"name": "MIN",
"text": "-"
},
{
"unaryExpr": [
{
"type": 5,
"name": "MIN",
"text": "-"
},
{
"numExpr": [
{
"type": 1,
"name": "NUM",
"text": "50"
}
]
}
]
}
]
},
{
"type": 7,
"name": "CPAR",
"text": ")"
}
]
}
]
},
{
"type": 3,
"name": "DIV",
"text": "/"
},
{
"numExpr": [
{
"type": 1,
"name": "NUM",
"text": "42"
}
]
}
]
},
{
"type": -1,
"name": "EOF",
"text": "<EOF>"
}
]
}