sqlfluff can generate a parsed query as a dict using the function .parse
, like in the example:
import json
import sqlfluff
print(json.dumps(sqlfluff.parse(
"""with cte1 as (select colA from table1), cte2 as (select colB from table2 inner join table1 on table2.colB = table1.colA) select * from cte2""",
"postgres"
), indent=4))
This results in the following
{
"file": {
"statement": {
"with_compound_statement": [
{
"keyword": "with"
},
{
"whitespace": " "
},
{
"common_table_expression": [
{
"naked_identifier": "cte1"
},
{
"whitespace": " "
},
{
"keyword": "as"
},
{
"whitespace": " "
},
{
"bracketed": {
"start_bracket": "(",
"select_statement": {
"select_clause": {
"keyword": "select",
"whitespace": " ",
"select_clause_element": {
"column_reference": {
"naked_identifier": "colA"
}
}
},
"whitespace": " ",
"from_clause": {
"keyword": "from",
"whitespace": " ",
"from_expression": {
"from_expression_element": {
"table_expression": {
"table_reference": {
"naked_identifier": "table1"
}
}
}
}
}
},
"end_bracket": ")"
}
}
]
},
{
"comma": ","
},
{
"whitespace": " "
},
{
"common_table_expression": [
{
"naked_identifier": "cte2"
},
{
"whitespace": " "
},
{
"keyword": "as"
},
{
"whitespace": " "
},
{
"bracketed": {
"start_bracket": "(",
"select_statement": {
"select_clause": {
"keyword": "select",
"whitespace": " ",
"select_clause_element": {
"column_reference": {
"naked_identifier": "colB"
}
}
},
"whitespace": " ",
"from_clause": {
"keyword": "from",
"whitespace": " ",
"from_expression": {
"from_expression_element": {
"table_expression": {
"table_reference": {
"naked_identifier": "table2"
}
}
},
"whitespace": " ",
"join_clause": [
{
"keyword": "inner"
},
{
"whitespace": " "
},
{
"keyword": "join"
},
{
"whitespace": " "
},
{
"from_expression_element": {
"table_expression": {
"table_reference": {
"naked_identifier": "table1"
}
}
}
},
{
"whitespace": " "
},
{
"join_on_condition": {
"keyword": "on",
"whitespace": " ",
"expression": [
{
"column_reference": [
{
"naked_identifier": "table2"
},
{
"dot": "."
},
{
"naked_identifier": "colB"
}
]
},
{
"whitespace": " "
},
{
"comparison_operator": {
"raw_comparison_operator": "="
}
},
{
"whitespace": " "
},
{
"column_reference": [
{
"naked_identifier": "table1"
},
{
"dot": "."
},
{
"naked_identifier": "colA"
}
]
}
]
}
}
]
}
}
},
"end_bracket": ")"
}
}
]
},
{
"whitespace": " "
},
{
"select_statement": {
"select_clause": {
"keyword": "select",
"whitespace": " ",
"select_clause_element": {
"wildcard_expression": {
"wildcard_identifier": {
"star": "*"
}
}
}
},
"whitespace": " ",
"from_clause": {
"keyword": "from",
"whitespace": " ",
"from_expression": {
"from_expression_element": {
"table_expression": {
"table_reference": {
"naked_identifier": "cte2"
}
}
}
}
}
}
}
]
}
}
}
Is there a way to reconstrunct the query string given one of these dicts? I couldn't find anything on the official API reference.
I asked the same question on sqlfluff's Slack and got an answer.
To achieve this you need more than just the simple API. You instance a Lexer and pass its results to a Parser object. Then, when calling Parser().parse(lex_result)
you get a FileSegment
which is composed by Segments that are also composed by Segments. You iter through them via .segments
attribute.
Each Segment can return to string via the .raw
attribute.
from sqlfluff.core import Parser, Lexer
lexer = Lexer(dialect="postgres")
parser = Parser(dialect="postgres")
tokens, _ = lexer.lex(
"""with cte1 as (select colA from table1), cte2 as (select colB from table2) select * from cte1"""
)
obj = parser.parse(tokens)
print(obj.segments)
print(obj.segments[0].raw)
Output:
>> (<StatementSegment: ([L: 1, P: 1])>, <EndOfFile: ([L: 1, P: 93]) ''>)
>> with cte1 as (select colA from table1), cte2 as (select colB from table2) select * from cte1