In my Google Cloud project I have several datasets containing tables and views; most of the views store the SQL query from which they were created. I am trying to parse every single one of those queries to get their table dependencies. By "table dependencies" I mean the tables named next to their FROM or JOIN clauses — not aliases and not whole subqueries/SELECT statements.
I am using this code:
from google.cloud import bigquery
import sqlparse
from sqlparse.sql import IdentifierList, Identifier, Parenthesis, Token
from sqlparse.tokens import Keyword, DML
def extract_tables(parsed):
    """Collect table names referenced in FROM/JOIN position of *parsed*.

    Walks the top-level tokens of a sqlparse statement (or group), turning
    on collection after a FROM/JOIN keyword and turning it off at the next
    clause keyword.  Parenthesized subqueries are recursed into so nested
    SELECTs contribute their table references too.

    Returns a set of real (unaliased) table names.
    """
    tables = set()
    from_seen = False
    for token in parsed.tokens:
        if from_seen:
            if isinstance(token, IdentifierList):
                for identifier in token.get_identifiers():
                    name = identifier.get_real_name()
                    # get_real_name() can return None (e.g. bare keywords
                    # grouped into the list); never add None to the result.
                    if name:
                        tables.add(name)
            elif isinstance(token, Identifier):
                name = token.get_real_name()
                if name:
                    tables.add(name)
            elif isinstance(token, Parenthesis):
                # Subquery in FROM position: recurse for its tables.
                tables.update(extract_tables(token))
            elif token.ttype is Keyword and token.value.upper() in (
                    'WHERE', 'GROUP BY', 'HAVING', 'ORDER BY', 'LIMIT',
                    'UNION', 'EXCEPT', 'INTERSECT'):
                from_seen = False
            continue
        # sqlparse emits 'LEFT JOIN', 'INNER JOIN', 'CROSS JOIN', ... as a
        # single Keyword token, so matching only the bare 'JOIN' misses
        # every qualified join.  Match any keyword ending in JOIN instead.
        if token.ttype is Keyword:
            upper = token.value.upper()
            if upper == 'FROM' or upper.endswith('JOIN'):
                from_seen = True
    return tables
def extract_table_names(sql):
    """Parse *sql* and return the set of tables referenced by its SELECT statements."""
    found = set()
    for statement in sqlparse.parse(sql):
        # Only SELECT statements carry FROM/JOIN table references we care about.
        if statement.get_type() != 'SELECT':
            continue
        found |= extract_tables(statement)
    return found
def get_table_query(client, dataset_id, table_id):
    """Return the defining SQL of a view, or None for a plain table."""
    table = client.get_table(f"{dataset_id}.{table_id}")
    # Only views carry a stored query; everything else has no source SQL.
    return table.view_query if table.table_type == 'VIEW' else None
def list_tables_and_sources(project_id):
    """Map every view in *project_id* to the tables its query references.

    Keys are fully qualified as "project.dataset.table" — matching the
    desired output schema — and values are lists of referenced table names.
    Plain tables (no defining query) are skipped entirely.
    """
    client = bigquery.Client(project=project_id)
    table_sources = {}
    for dataset in client.list_datasets():
        dataset_id = dataset.dataset_id
        for table in client.list_tables(dataset_id):
            query = get_table_query(client, dataset_id, table.table_id)
            if not query:
                continue
            source_tables = extract_table_names(query)
            # Qualify the key with the project so it matches the intended
            # "my-project-id.dataset.table" schema instead of "dataset.table".
            table_key = f"{project_id}.{dataset_id}.{table.table_id}"
            table_sources[table_key] = list(source_tables)  # set -> JSON-friendly list
    return table_sources
if __name__ == '__main__':
    # Guarding the driver code prevents BigQuery API calls from firing as a
    # side effect of merely importing this module.
    import json

    project_id = 'my-project-id'
    table_sources = list_tables_and_sources(project_id)
    print(json.dumps(table_sources, indent=4))
I want my output to follow this schema: {"my-project-id.reported_data.orders": ["dataset.tableName", "dataset.tableName"]} — a fully qualified view name mapped to the list of tables it depends on.
.
Most tables are parsed fine, outputting exactly what they should, but some tables/queries are skipped entirely, producing an empty array as their result.
For example, I have this view
SELECT
barcode,
CAST(CONCAT(min(EXTRACT(YEAR FROM active_date)), '-', min(EXTRACT(MONTH FROM active_date)), '-01') as DATE) as first_of_month,
count(active_date) as active_days,
FROM `my-project-id.erp_raw_data.fsn_lines`
GROUP BY barcode, (EXTRACT(YEAR FROM active_date)), EXTRACT(MONTH FROM active_date)
ORDER BY barcode, first_of_month;
But my output looks like this
{"erp_formatted_data.active_days": [],...}
Another view/query that after parsing returns an empty array is that one
SELECT
date,
barcode,
calendar_month,
calendar_year,
quarter,
image_url,
sold_quantity,
number_of_sales,
number_of_invoices,
invoiced_quantity,
invoiced_revenue,
views,
impressions,
supplier_name,
main_category,
sub_category,
(number_of_sales / views) as sales_per_views,
(number_of_sales / impressions) as sales_per_impressions,
IF(number_of_sales > 0 OR views > 0 OR impressions > 0 OR active_day = 1, 1, 0) as active_day,
FROM
`my-project-id.erp_formatted_data.daily_barcodes_stats`
I have no idea what the problem could be in my code. Any ideas?
I tried using the sqlparse library and failed multiple times, so I switched to the sql-metadata one. Here's my updated, fully working code:
from google.cloud import bigquery
from sql_metadata import Parser
import re
def get_bigquery_views(project_id):
    """Return (dataset_id, view_id, view_sql) triples for every view in the project."""
    client = bigquery.Client(project=project_id)
    found = []
    for dataset in client.list_datasets():
        ds_ref = client.dataset(dataset.dataset_id)
        for entry in client.list_tables(ds_ref):
            # Skip plain tables; only views carry a stored query.
            if entry.table_type != 'VIEW':
                continue
            view = client.get_table(ds_ref.table(entry.table_id))
            found.append((dataset.dataset_id, view.table_id, view.view_query))
    return found
# BigQuery functions / SQL keywords that sql-metadata sometimes mistakes
# for table names; excluded from the dependency list.
_KNOWN_FUNCTIONS_AND_KEYWORDS = frozenset({
    'UNNEST', 'GENERATE_DATE_ARRAY', 'CURRENT_DATE', 'INTERVAL', 'FORMAT_DATE',
    'EXTRACT', 'SELECT', 'FROM', 'ORDER', 'BY', 'SPLIT', ',', 'AS',
})

# Valid table references: dot-separated project/dataset/table parts.
# BigQuery project ids may contain hyphens (e.g. "my-project-id"), so the
# character class must allow '-' — without it every fully-qualified table
# from a hyphenated project is silently filtered out.
_TABLE_NAME_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_.\-]*$")


def filter_known_functions_and_keywords(tables):
    """Drop entries that are SQL functions/keywords or malformed table names.

    Args:
        tables: iterable of candidate table-name strings.

    Returns:
        A list containing only the plausible table references, in order.
    """
    return [
        table for table in tables
        if table.upper() not in _KNOWN_FUNCTIONS_AND_KEYWORDS
        and _TABLE_NAME_PATTERN.match(table)
    ]
def extract_real_tables(query):
    """Return the fully-qualified tables *query* actually reads from.

    Aliases and un-dotted names (e.g. field references inside UNNEST)
    are excluded before the keyword/function filter is applied.
    """
    parser = Parser(query)
    alias_names = parser.tables_aliases
    qualified = [
        name for name in parser.tables
        if '.' in name and name not in alias_names
    ]
    return filter_known_functions_and_keywords(qualified)
if __name__ == '__main__':
    project_id = 'my-project-id'
    results = []
    for dataset_id, view_name, view_query in get_bigquery_views(project_id):
        real_tables = extract_real_tables(view_query)
        # Keys follow the "project_id.dataset_id.view_name" schema.
        key = f"{project_id}.{dataset_id}.{view_name}"
        entry = set(real_tables) if real_tables else "No table/view associated"
        results.append({key: entry})
    # Print the results in a readable format
    for result in results:
        for view, tables in result.items():
            print(f"{view} : {tables}")