I am trying to extract emails from multiple excel files and get them appended in to a CSV file. The program was working for the last few days. But now it is not creating the CSV file in the output folder. I even tried to make one manually but it deletes the CSV file created manually as soon as i run the code.
here is my program
import os
import re
import pandas as pd
# Regular expression pattern to match email addresses
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'
# Function to extract email addresses from a string
def extract_emails(text):
return re.findall(email_pattern, text)
# Function to extract emails from an Excel file
def extract_emails_from_excel(file_path):
email_list = []
try:
df = pd.read_excel(file_path)
for column in df.columns:
for cell in df[column]:
if isinstance(cell, str):
emails = extract_emails(cell)
email_list.extend(emails)
except Exception as e:
print(f"Error processing {file_path}: {e}")
return email_list
# Specify the folder containing Excel files
folder_path = r'E:\1DW\Excel'
# Specify the path for the output CSV file
output_csv_file = r'C:\Users\HAL-2023\Desktop\Py_out\output_emails.csv'
# Ensure the output CSV file is empty or create it if it doesn't exist
if os.path.exists(output_csv_file):
os.remove(output_csv_file)
# Loop through Excel files in the folder
for filename in os.listdir(folder_path):
if filename.endswith('.xlsx') or filename.endswith('.xls'):
input_file_path = os.path.join(folder_path, filename)
email_addresses = extract_emails_from_excel(input_file_path)
# Append extracted email addresses to the CSV file
if email_addresses:
df = pd.DataFrame({'Email Addresses': email_addresses})
df.to_csv(output_csv_file, mode='a', index=False, header=False)
print(f"Extracted email addresses written to {output_csv_file}")
Result
C:\Users\HAL-2023\Desktop\Python\venv\Scripts\python.exe C:\Users\HAL-2023\Desktop\Python\email_from_excel.py
C:\Users\HAL-2023\Desktop\Python\venv\lib\site-packages\openpyxl\styles\stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default
warn("Workbook contains no default style, apply openpyxl's default")
Extracted email addresses written to C:\Users\HAL-2023\Desktop\Py_out\output_emails.csv
Process finished with exit code 0
But there is no file called "output_emails.csv" in that folder.
Your problem seems to be the approach of deleting the output_emails.csv
file at the beginning of the script and then trying to append to it. When you're iterating through multiple Excel files and trying to write emails to the CSV file, if one of those Excel files doesn't have any emails, it means you'll end up deleting the CSV and not recreating it, which can lead to the described behavior.
So instead of deleting the CSV file, you should check if it exists and append to it. If it doesn't exist, create it with a header. You don't need to create a new DataFrame every time you want to append to the CSV file. Instead, you can use the CSV module to append the data.
import os
import re
import pandas as pd
import csv
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'
def extract_emails(text):
return re.findall(email_pattern, text)
def extract_emails_from_excel(file_path):
email_list = []
try:
df = pd.read_excel(file_path)
for column in df.columns:
for cell in df[column]:
if isinstance(cell, str):
emails = extract_emails(cell)
email_list.extend(emails)
except Exception as e:
print(f"Error processing {file_path}: {e}")
return email_list
folder_path = r'E:\1DW\Excel'
output_csv_file = r'C:\Users\HAL-2023\Desktop\Py_out\output_emails.csv'
if not os.path.exists(output_csv_file):
with open(output_csv_file, 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow(['Email Addresses'])
for filename in os.listdir(folder_path):
if filename.endswith('.xlsx') or filename.endswith('.xls'):
input_file_path = os.path.join(folder_path, filename)
email_addresses = extract_emails_from_excel(input_file_path)
if email_addresses:
with open(output_csv_file, 'a', newline='') as f:
writer = csv.writer(f)
writer.writerows([[email] for email in email_addresses])
print(f"Extracted email addresses written to {output_csv_file}")