I needed to create a subroutine that loops through a collection of outlook messages, opens the attachments and extracts any tabular data in a zip folder to a pandas data frame. To grab the tabular data, I created a function called zip_to_dfs that accepts an outlook MailItem attachment as the argument).
#function to extract tabluar data within zip file to pandas dataframe. returns dictionary object(key=filename;value=pandas df)
import pandas as pd, zipfile, tempfile, os
def zip_to_dfs(attachment, extract_fn=None):
#returns diciontary object with filename for key and dataframes from attached files as values
df_objects = {}
tmp=tempfile.TemporaryFile().name
attachment.SaveAsFile(tmp)
if zipfile.is_zipfile(tmp)==True:
zf = zipfile.ZipFile(tmp)
#below subroutine could be made to separate function (read tablular to df) to make more readable
for file in zf.infolist():
extension = os.path.splitext(file.filename)[1]
if extension in ['.xls','.xlsx','.xlsm']:
temp_df = pd.read_excel(zf.open(file.filename), header=None)
df_objects.update({file.filename:temp_df})
elif file.filename.endswith(".csv"):
temp_df = pd.read_csv(zf.open(file.filename), header=None)
df_objects.update({file.filename:temp_df})
else:
raise NotImplementedError('Unexpected filetype: '+str(file.filename))
else:
raise NotImplementedError('Expected zip file')
return(df_objects)
The function works as intended, but it's probably inefficient. Has anyone used the tempfile or zip file libraries? If so, do you know if Zipfile and TemporaryFile methods clean up automatically? Or are these files being left open on the disk? Do you seen any other obvious issues with this approach?
Edited Code Revision:
def zipattach_to_dfs(attachment, extract_fn=None):
#evaluates zip file attachments and returns dictionary with file name as key and dataframes as values
df_objects = {}
with NamedTemporaryFile(suffix='.tmp', delete=False) as tmp:
attachment.SaveAsFile(tmp.name)
zf = ZipFile(tmp)
for file in zf.infolist():
datetime = (file.date_time)
key = (f'{file.filename}({datetime[0]}-{datetime[1]}-{datetime[2]})')
if isexcel(file) ==True:
temp_df = pd.read_excel(zf.open(file.filename), header=None)
df_objects.update({key:temp_df})
elif file.filename.endswith(".csv"):
temp_df = pd.read_csv(zf.open(file.filename), header=None)
df_objects.update({key:temp_df})
else:
raise NotImplementedError('Unexpected filetype: '+str(file.filename))
return (df_objects)
ZipFile
supports the with statement too. So, here is my suggestion based on your code(s) :
def zip_to_dfs(attachment, extract_fn=None): # extract_fn ?
'''
Returns a dictionary object with filename for key
and dataframes from attached files as values.
'''
with NamedTemporaryFile(delete=False) as tmp:
attachment.SaveAsFile(tmp.name)
if is_zipfile(tmp):
with ZipFile(tmp) as zf:
for file in zf.infolist():
fn, exte = file.filename.rsplit(".", 1)
key = (f'{fn} ({"-".join(map(str, file.date_time[:3]))})')
if exte in {'xls', 'xlsx', 'xlsm', 'csv'}:
df_objects = {}
with zf.open(file) as zip_file:
if exte == 'csv':
df = pd.read_csv(zip_file, header=None)
else:
df = pd.read_excel(zip_file, header=None)
df_objects[key] = df
else:
raise NotImplementedError(
'Unexpected filetype: ' + file.filename
)
return df_objects
Calling this function would look like something below :
from win32com.client import Dispatch
outlook = Dispatch("Outlook.Application").GetNamespace("MAPI")
folder = outlook.Folders("foo@bar.qux").Folders("Inbox")
out = pd.concat(
[v.assign(date=k) for item in folder.Items for att in item.Attachments
if zip_to_dfs(att) for k,v in zip_to_dfs(att).items()
]
) # this will consolidate all the dfs in a single one