I have a .csv with a column called 'Name' listing individuals in Firstname MI. Lastname
format:
And I have a folder full of files made by these individuals in lastnamefirstname
format:
Using the list of names I need to find the files that match and save those files to a separate folder.
What is the best way to do this?
Let's prepare some space to model:
# necessary part
import pandas as pd
from pathlib import Path
from shutil import copy # consider copy2 as alternative if metadata should be copied as well
# for demonstration purpose
from tempfile import TemporaryDirectory
from os import listdir
from io import StringIO
data = '''
Stephen A. Johnson
Stephen B. Johnson
Albert J. Reiß
'''
csv_source = StringIO(data)
source = TemporaryDirectory(); src = Path(source.name)
destin = TemporaryDirectory(); dst = Path(destin.name)
filenames = [
'johnsonstephen_30',
'johnsonstephen_40',
'reissalbert_50', # should be copied with `casefold` and ignored with `lower`
'gibberish_10', # should be ignored
]
for file in filenames:
(src/file).touch()
Now the main part. There's at least one pitfall: casefold
versus lower
when applied to the original names. This may influence on the file names selected and should be consciously resolved on your side. I use casefold
and the name "Reiß"
to demonstrate how it might appear.
# read the sequence of names
person = pd.read_csv(csv_source, header=None).squeeze()
# preparing name stems
ersatz = (
person
.str.casefold() # not identical to `lower`, select one according to naming policy
.str.split()
.str[::-2] # it won't work if the middle name doesn't exist.
.str.join('')
.unique()
)
# copying files of interest
for name in ersatz:
for file in src.rglob(f'{name}_*'):
copy(file, dst)
Check the result and clean up the space:
print(f'Source: {listdir(src)}')
print(f'Destination: {listdir(dst)}')
source.cleanup()
destin.cleanup()
Expected output:
Source: ['gibberish_1234', 'johnsonstephen_40', 'johnsonstephen_30', 'reissalbert_50']
Destination: ['johnsonstephen_40', 'johnsonstephen_30', 'reissalbert_50']
P.S. Alternatively, when loading a csv file, we can interpret whitespace characters as a delimiter:
# read the sequence of names
person = pd.read_csv(
csv_source,
header=None,
delim_whitespace=True,
usecols=[0,2],
names=['firstname','lastname'],
)
ersatz = person['lastname'].str.casefold() + person['firstname'].str.casefold()
from pandas import read_csv
from pathlib import Path
from shutil import copy
# declare csv-file and source/destination folders
source = 'path/to/folder/with/files'
destination = 'folder/to/copy/files/into'
csv_source = 'path/to/comma_separated_data_with_column_Name.csv'
# main script
src = Path(source)
dst = Path(destination)
person = read_csv(csv_source)['Name'].squeeze()
ersatz = (
person
.str.casefold()
.str.split()
.pipe(lambda seq: seq.str[-1] + seq.str[0]) # rely only on first and last name
.unique()
)
for name in ersatz:
for file in src.rglob(f'{name}_*'):
copy(file, dst)