I have PDF files in same folder. How to get all PDF file names and save as excel file according to PDF file name. This is what I have tried
def get_files(pdf_path):
import os
os.chdir(pdf_path)
files = os.listdir()
files = [x for x in files if x.endswith(".pdf")]
return files
files = get_files(pdf_path)
for i in files:
save_as_excel(pdf_path, i)
As discussed on chat, this is the continuation of your previous question, which I answered. In the previous question I answered how you can extract text from the pdf file which contains multiple data entity. Now you want to extract the text and parse the content to save the data as csv/xlsx
for all pdf files present in the folder.
Please go through all the steps below, all you need to change below is the path of your directory to pdf files path_of_pdf_files
Assumption and logic would remain same from my previous answer.
I have moved the data and methods and encapsulated to a class PdfExtractor
.
Please follow the below steps to extract text from pdf and save as xlsx.
Before moving ahead install the packages pdfplumber, xlsxwriter
PdfExtractor.py
import pdfplumber
import xlsxwriter
import re
# regex pattern for keys in line1 of data entity
my_regex_dict_line1 = {
'Our Ref' : r'Our Ref :(.*?)Name',
'Name' : r'Name:(.*?)Ref 1',
'Ref 1' : r'Ref 1 :(.*?)Ref 2',
'Ref 2' : r'Ref 2:(.*?)$'
}
# regex pattern for keys in line2 of data entity
my_regex_dict_line2 = {
'Amount' : r'Amount:(.*?)Total Paid',
'Total Paid' : r'Total Paid:(.*?)Balance',
'Balance' : r'Balance:(.*?)Date of A/C',
'Date of A/C' : r'Date of A/C:(.*?)Date Received',
'Date Received' : r'Date Received:(.*?)$'
}
# regex pattern for keys in line3 of data entity
my_regex_dict_line3 ={
'Last Paid' : r'Last Paid:(.*?)Amt Last Paid',
'Amt Last Paid' : r'Amt Last Paid:(.*?)A/C\s+Status',
'A/C Status': r'A/C\s+Status:(.*?)Collector',
'Collector' : r'Collector :(.*?)$'
}
class PdfExtractor:
data_entity_sep_pattern = r'(?=Our Ref.*?Name.*?Ref 1.*?Ref 2)'
def __init__(self, pdf_path):
self.pdf_path = pdf_path
self.json_data = {}
self.pdf_text = ''
def __preprocess_data(self, data):
return [el.strip() for el in data.splitlines() if el.strip()]
def __get_header_data(self, text):
header_data_list = self.__preprocess_data(text)
# third line in text of header contains Date Created field
self.json_data['Date Created'] = re.search(r'Date Created:(.*?)$', header_data_list[2]).group(1).strip()
# fourth line in text contains Number of Pages, Client Code, Client Name
self.json_data['Number of Pages'] = re.search(r'Number of Pages:(.*?)$', header_data_list[3]).group(1).strip()
# fifth line in text contains Client Code and ClientName
self.json_data['Client Code'] = re.search(r'Client Code - (.*?)Client Name', header_data_list[4]).group(1).strip()
self.json_data['ClientName'] = re.search(r'Client Name - (.*?)$', header_data_list[4]).group(1).strip()
def __iterate_through_regex_and_populate_dictionaries(self, data_dict, regex_dict, text):
''' For the given pattern of regex_dict, this function iterates through each regex pattern and adds the key value to regex_dict dictionary '''
for key, regex in regex_dict.items():
matched_value = re.search(regex, text)
if matched_value is not None:
data_dict[key] = matched_value.group(1).strip()
def __populate_date_notes(self, data_dict, text):
''' This function populates date and Notes in the data chunk in the form of list to data_dict dictionary '''
data_dict['Date'] = []
data_dict['Notes'] = []
iter = 4
while(iter < len(text)):
date_match = re.search(r'(\d{2}/\d{2}/\d{4})',text[iter])
data_dict['Date'].append(date_match.group(1).strip())
notes_match = re.search(r'\d{2}/\d{2}/\d{4}\s*(.*?)$',text[iter])
data_dict['Notes'].append(notes_match.group(1).strip())
iter += 1
def get_pdf_text(self):
data_index = 1
with pdfplumber.open(self.pdf_path) as pdf:
index = 0
while(index < len(pdf.pages)):
page = pdf.pages[index]
self.pdf_text += '\n' + page.extract_text()
index += 1
split_on_data_entity = re.split(self.data_entity_sep_pattern, self.pdf_text.strip())
# first data in the split_on_data_entity list will contain the header information
self.__get_header_data(split_on_data_entity[0])
while(data_index < len(split_on_data_entity)):
data_entity = {}
data_processed = self.__preprocess_data(split_on_data_entity[data_index])
self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line1, data_processed[0])
self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line2, data_processed[1])
self.__iterate_through_regex_and_populate_dictionaries(data_entity, my_regex_dict_line3, data_processed[2])
if(len(data_processed) > 3 and data_processed[3] != None and 'Date' in data_processed[3] and 'Notes' in data_processed[3]):
self.__populate_date_notes(data_entity, data_processed)
self.json_data['data_entity' + str(data_index)] = data_entity
data_index += 1
return self.json_data
def save_as_xlsx(self, file_name):
if(not self.json_data):
print("Data was not read from PDF")
return
workbook = xlsxwriter.Workbook(file_name)
worksheet = workbook.add_worksheet("Sheet 1")
row = 0
col = 0
# write column
columns = ['Account History Report', 'All Notes'] + [ key for key in self.json_data.keys() if 'data_entity' not in key ] + list(self.json_data['data_entity1'].keys())
worksheet.write_row(row, col, tuple(columns))
row += 1
column_index_map = {}
for index, col in enumerate(columns):
column_index_map[col] = index
# write the header
worksheet.write(row, column_index_map['Date Created'], self.json_data['Date Created'])
worksheet.write(row, column_index_map['Number of Pages'], self.json_data['Number of Pages'])
worksheet.write(row, column_index_map['Client Code'], self.json_data['Client Code'])
worksheet.write(row, column_index_map['ClientName'], self.json_data['ClientName'])
data_entity_index = 1
#iterate through each data entity and for each key insert the values in the sheet
while True:
data_entity_key = 'data_entity' + str(data_entity_index)
row_size = 1
if(self.json_data.get(data_entity_key) != None):
for key, value in self.json_data.get(data_entity_key).items():
if(type(value) == list):
worksheet.write_column(row, column_index_map[key], tuple(value))
row_size = len(value)
else:
worksheet.write(row, column_index_map[key], value)
else:
break
data_entity_index += 1
row += row_size
workbook.close()
print(file_name + " saved successfully")
path_of_pdf_files
and saves the data in a xlsx file in the same directory. Also note that the below code should be executed in the same folder where you saved the file PdfExtractor.py
import os
from PdfExtractor import PdfExtractor
path_of_pdf_files = r'C:\Users\hpoddar\Desktop\Temp' # Directory path for your pdf files
files = os.listdir(path_of_pdf_files)
for file in files:
if(not file.endswith(".pdf")):
continue
filename = os.path.splitext(file)[0]
pdf_obj = PdfExtractor(os.path.join(path_of_pdf_files, file))
pdf_text = pdf_obj.get_pdf_text()
pdf_obj.save_as_xlsx(os.path.join(path_of_pdf_files, filename + '.xlsx'))
Output :
C:\Users\hpoddar\Desktop\Temp\sample.xlsx saved successfully
C:\Users\hpoddar\Desktop\Temp\sample2.xlsx saved successfully
C:\Users\hpoddar\Desktop\Temp\sample3.xlsx saved successfully
Lets say you have following pdf files in the directory sample.pdf, sample2.pdf, sample3.pdf
. The xlsx files will be created in the same folder with following filename sample.xlsx, sample2.xlsx, sample3.xlsx
Let me know if you have any doubts in the above code.