pythonpandaslistdataframefilelist

Pandas store filenames in list and read all the files into a dataframe


Hi I am trying to search for specified filenames using glob and re. Once the match is found i am storing the list of matched files into a text file. What i need is that i want to fetch all these matched file names into a list so that i can store them into a single dataframe .

import re
import sys
import os
import glob
import pandas as pd
import logging

with open('files_matched.txt', 'w') as f_matched, open('files_not_matched.txt','w') as f_notmatched:
    try:
        for file in glob.glob('*.csv'):
                r = re.search(r'abc_sales(20[0-9][0-9])-([1-9]|1[0-2]|0[0-9])-([1-9]|1[0-9]|2[0-9]|3[0-1]|0[0-9])-[0-9]{2}_[a-z0-9]{3,5}.csv', file)
                if r:
                    filename=[file] # unable to store list of files 
                    match=f'File matched:{file}'
                    f_matched.write(match+'\n')
                else:
                    not_match=f'File not matched:{file}'
                    f_notmatched.write(not_match + '\n')
    except Exception as e:
        pass

df=[pd.read_csv(f,encoding='ISO-8859-1',error_bad_lines=False,engine='python') for f in filename]

for df_new, f in zip(df,filename):
      df_new['f'] = f
combined_df = pd.concat(df, ignore_index=False)
combined_df.head()

Solution

  • Create new list of files by append and then create new column by DataFrame.assign:

    filenames = []
    with open('files_matched.txt', 'w') as f_matched, open('files_not_matched.txt','w') as f_notmatched:
        try:
            for file in glob.glob('*.csv'):
                    r = re.search(r'abc_sales(20[0-9][0-9])-([1-9]|1[0-2]|0[0-9])-([1-9]|1[0-9]|2[0-9]|3[0-1]|0[0-9])-[0-9]{2}_[a-z0-9]{3,5}.csv', file)
                    if r:
                        filenames.append(file) # add filename to list 
                        match=f'File matched:{file}'
                        f_matched.write(match+'\n')
                    else:
                        not_match=f'File not matched:{file}'
                        f_notmatched.write(not_match + '\n')
        except Exception as e:
            pass
    
    #add new column by assign
    df=[pd.read_csv(f,
                    encoding='ISO-8859-1',
                    error_bad_lines=False,
                    engine='python').assign(f = f) 
        for f in filenames]
    
    combined_df = pd.concat(df, ignore_index=False)