I'm creating a function that gets a list of all the global DataFrames in memory and does something with them. Example:
import pandas as pd
import pyspark.pandas as ps

df1 = pd.DataFrame()
df2 = pd.DataFrame()

def func():
    # Get all dataframes in memory
    alldfs = [var for var in dir() if isinstance(eval(var), pd.core.frame.DataFrame)]
    # do something with them, e.g. convert each one to a pyspark.pandas DataFrame
    return dict(zip(alldfs, [ps.from_pandas(globals()[name]) for name in alldfs]))

func()  # returns df1 and df2

df3 = pd.DataFrame()

func()  # returns df1 and df2
The issue is that the globals that existed when the function was created seem to be captured and reused every time I run the function, but I want it to read the most up-to-date globals every time it's run, so if five more DataFrames are created afterwards, it will return those too.
Originally I tried writing the function in another module and importing it, but globals() wasn't working there, so now I'm defining it in the same notebook and still having the same issue.
Okay, to add more context, since it seems like this is a bad approach: my goal was to pack the pyspark ps.sql feature into a function so that others using the notebook could easily write SQL queries against the DataFrames:
import re

import pandas as pd
import pyspark.pandas as ps


def get_all_dataframes():
    return [name for name, var in globals().items() if isinstance(var, pd.core.frame.DataFrame)]


def query_data(query):
    # Gather all dataframes saved in memory
    dataframes = get_all_dataframes()

    # Find all table names after FROM or JOIN
    tables = re.findall(r'FROM\s+(\w+)|JOIN\s+(\w+)', query, re.IGNORECASE)
    tables = [table for sublist in tables for table in sublist if table]

    tables_found = []
    # Check if tables exist in memory and replace them with curly brackets
    for table in tables:
        if table in dataframes:
            query = re.sub(r'\b' + table + r'\b', f'{{{table}}}', query)
            if table not in tables_found:
                tables_found.append(table)
        else:
            raise ValueError(f"Table '{table}' does not exist in memory.")

    # Create the query args and turn each Pandas DF into a Pyspark DF
    print(f"Tables detected: {tables_found}")
    query_args = dict(zip(tables_found, [ps.from_pandas(globals()[table]) for table in tables_found]))

    def _run_query(**query_args):
        return ps.sql(
            query,
            **query_args
        ).to_pandas()

    query_results = _run_query(**query_args)
    print(f"Table created with {query_results.shape[0]:,} rows and {query_results.shape[1]:,} columns.")
    return query_results
So that way, instead of having to write this:
ps.sql('''
SELECT m1.a, m2.b
FROM {table1} m1 INNER JOIN {table2} m2
ON m1.key = m2.key
ORDER BY m1.a, m2.b''',
table1=ps.from_pandas(table1),
table2=ps.from_pandas(table2))
You can just write this:
query_data(
"""
SELECT m1.a, m2.b
FROM table1 m1 INNER JOIN table2 m2
ON m1.key = m2.key
ORDER BY m1.a, m2.b
"""
)
This is a simple example; the real queries are much more complex. The reason for using SQL is that I'm transitioning a lot of code to Python, and many of the transformations were written with PROC SQL (SAS), so keeping them as SQL makes it a bit easier for the end users to make changes.
Use globals()
import pandas as pd

df1 = pd.DataFrame()
df2 = pd.DataFrame()

def func():
    # Get all dataframes in memory
    alldfs = [name for name, var in globals().items() if isinstance(var, pd.core.frame.DataFrame)]
    # do something
    return alldfs

print(func())  # returns df1 and df2

df3 = pd.DataFrame()

print(func())
This outputs
$ python myfile.py
['df1', 'df2']
['df1', 'df2', 'df3']
This clearly contains df3, the newly created DataFrame, which is also in global scope.
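As for the other point in the question, defining the helper in a separate module and importing it: globals() always refers to the globals of the module it is called in, so an imported function only sees its own module's namespace, not the notebook's. A minimal sketch of one way around that (the module name helpers.py and the namespace parameter are just placeholders chosen for illustration) is to let the caller pass its own globals() in explicitly:

# helpers.py  (hypothetical module name)
import pandas as pd

def get_all_dataframes(namespace=None):
    # `namespace` should be the caller's globals(); falling back to this
    # module's own globals() is rarely what you want from a notebook.
    if namespace is None:
        namespace = globals()
    return [name for name, var in namespace.items()
            if isinstance(var, pd.core.frame.DataFrame)]

and in the notebook:

from helpers import get_all_dataframes
get_all_dataframes(globals())  # evaluated fresh on every call, so new DataFrames are picked up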