python, sql, google-bigquery, sql-parser

Parse Google BQ SQL queries and get all tables referenced


In a Google Cloud project I have several datasets with tables/views in them; most of the views still have the query they were created from. I am trying to parse every one of these queries to get their table dependencies. By "table dependencies" I mean the tables that appear after FROM or JOIN, not aliases or whole subqueries/SELECT statements.

I am using this code:

from google.cloud import bigquery
import sqlparse
from sqlparse.sql import IdentifierList, Identifier, Parenthesis
from sqlparse.tokens import Keyword

def extract_tables(parsed):
    # Walk the top-level tokens and collect the names that follow FROM/JOIN.
    tables = set()
    from_seen = False

    for token in parsed.tokens:
        if from_seen:
            if isinstance(token, IdentifierList):
                for identifier in token.get_identifiers():
                    tables.add(identifier.get_real_name())
            elif isinstance(token, Identifier):
                tables.add(token.get_real_name())
            elif isinstance(token, Parenthesis):
                subquery = extract_tables(token)
                tables.update(subquery)
            elif token.ttype is Keyword and token.value.upper() in ('WHERE', 'GROUP BY', 'HAVING', 'ORDER BY', 'LIMIT', 'UNION', 'EXCEPT', 'INTERSECT'):
                from_seen = False
                continue
        if token.ttype is Keyword and token.value.upper() in ('FROM', 'JOIN'):
            from_seen = True

    return tables

def extract_table_names(sql):
    tables = set()
    parsed = sqlparse.parse(sql)
    for stmt in parsed:
        if stmt.get_type() == 'SELECT':
            tables.update(extract_tables(stmt))
    return tables

def get_table_query(client, dataset_id, table_id):
    table = client.get_table(f"{dataset_id}.{table_id}")
    if table.table_type == 'VIEW':
        return table.view_query
    return None

def list_tables_and_sources(project_id):
    client = bigquery.Client(project=project_id)
    datasets = list(client.list_datasets())
    table_sources = {}

    for dataset in datasets:
        dataset_id = dataset.dataset_id
        tables = list(client.list_tables(dataset_id))
        
        for table in tables:
            table_id = table.table_id
            query = get_table_query(client, dataset_id, table_id)
            if query:
                source_tables = extract_table_names(query)
                table_key = f"{dataset_id}.{table_id}"
                table_sources[table_key] = list(source_tables)  # Convert set to list

    return table_sources

project_id = 'my-project-id'
table_sources = list_tables_and_sources(project_id)

import json
print(json.dumps(table_sources, indent=4))

I want my output to follow this schema: {"my-project-id.reported_data.orders" : {dataset.tableName, dataset.TableName}}.

Most tables are parsed fine and output exactly what they should, but some tables/queries are skipped entirely and come back as an empty array.

For example, I have this view:

SELECT  
barcode, 
CAST(CONCAT(min(EXTRACT(YEAR FROM active_date)), '-', min(EXTRACT(MONTH FROM active_date)), '-01') as DATE) as first_of_month,
count(active_date) as active_days,
FROM `my-project-id.erp_raw_data.fsn_lines`
GROUP BY barcode, (EXTRACT(YEAR FROM active_date)), EXTRACT(MONTH FROM active_date)
ORDER BY barcode, first_of_month;

But my output looks like this

{"erp_formatted_data.active_days": [],...}

Another view/query that returns an empty array after parsing is this one:

SELECT
  date,
  barcode,
  calendar_month,
  calendar_year,
  quarter,
  image_url,
  sold_quantity,
  number_of_sales,
  number_of_invoices,
  invoiced_quantity,
  invoiced_revenue,
  views,
  impressions,
  supplier_name,
  main_category,
  sub_category,
  (number_of_sales / views) as sales_per_views,
  (number_of_sales / impressions) as sales_per_impressions,
  IF(number_of_sales > 0 OR views > 0 OR impressions > 0 OR active_day = 1, 1, 0) as active_day,
FROM
  `my-project-id.erp_formatted_data.daily_barcodes_stats`

I have no idea what the problem could be in my code. Any ideas?
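For debugging, here is a minimal sketch that dumps what sqlparse actually produces for a trimmed copy of the first failing view, i.e. the statement type and the top-level token stream, to see whether the FROM keyword ever shows up where extract_tables looks for it:

import sqlparse

# Trimmed copy of the first failing view; the trailing comma before FROM is kept.
failing_view = """
SELECT
barcode,
count(active_date) as active_days,
FROM `my-project-id.erp_raw_data.fsn_lines`
GROUP BY barcode
"""

stmt = sqlparse.parse(failing_view)[0]
print("statement type:", stmt.get_type())
for token in stmt.tokens:
    if not token.is_whitespace:
        # Show how sqlparse grouped each top-level token.
        print(type(token).__name__, token.ttype, repr(str(token)))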


Solution

  • I tried the sqlparse library and failed multiple times, so I switched to sql-metadata. Here's my updated, fully working code:

    from google.cloud import bigquery
    from sql_metadata import Parser
    import re
    
    def get_bigquery_views(project_id):
        client = bigquery.Client(project=project_id)
        views = []
    
        for dataset in client.list_datasets():
            dataset_ref = client.dataset(dataset.dataset_id)
            for table in client.list_tables(dataset_ref):
                if table.table_type == 'VIEW':
                    view_ref = dataset_ref.table(table.table_id)
                    view = client.get_table(view_ref)
                    views.append((dataset.dataset_id, view.table_id, view.view_query))
    
        return views
    
    def filter_known_functions_and_keywords(tables):
        # List of known BigQuery functions and keywords to exclude
        known_functions_and_keywords = {
            'UNNEST', 'GENERATE_DATE_ARRAY', 'CURRENT_DATE', 'INTERVAL', 'FORMAT_DATE', 
            'EXTRACT', 'SELECT', 'FROM', 'ORDER', 'BY', 'SPLIT', ',', 'AS'
        }
    
        # Regex pattern to match valid table names (e.g., project.dataset.table);
        # hyphens are allowed because BigQuery project IDs (e.g. my-project-id) can contain them
        table_name_pattern = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_.-]*$")
    
        # Filter out known functions, keywords, and invalid table names
        filtered_tables = [
            table for table in tables 
            if table.upper() not in known_functions_and_keywords and table_name_pattern.match(table)
        ]
        return filtered_tables
    
    def extract_real_tables(query):
        parser = Parser(query)
        tables = parser.tables
        aliases = parser.tables_aliases
    
        # Exclude tables that are aliases or field names in functions like UNNEST
        real_tables = [
            table for table in tables 
            if table not in aliases and '.' in table
        ]
    
        return filter_known_functions_and_keywords(real_tables)
    
    if __name__ == '__main__':
        project_id = 'my-project-id'
    
        views = get_bigquery_views(project_id)
        results = []
    
        for dataset_id, view_name, view_query in views:
            real_tables = extract_real_tables(view_query)
    
            # Format the key as "project_id.dataset_id.view_name"
            key = f"{project_id}.{dataset_id}.{view_name}"
    
            if real_tables:
                results.append({key: set(real_tables)})
            else:
                results.append({key: "No table/view associated"})
    
        # Print the results in a readable format
        for result in results:
            for view, tables in result.items():
                print(f"{view} : {tables}")