I have a 100-page PDF document. Every two consecutive pages contain the data of one unique employee. I need Python code that extracts each pair of pages and saves it as a separate file, with the filename taken from text extracted from the first page of that pair. For example:
I would be most pleased with a Python solution.
Thanks in advance
I have tried to adapt the following, seemingly similar, Python solution, but it doesn't appear to work for me:
from PyPDF2 import PdfReader, PdfWriter
import re
import argparse
import os
# Split files are written to an "output" folder inside the directory the
# script is started from.
cwd = os.getcwd()
output_dir = os.path.join(cwd, 'output')
# exist_ok=True makes this a no-op when the folder is already there and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)
def _positive_int(value):
    """Convert a command-line value to an int and insist that it is >= 1."""
    number = int(value)  # ValueError here is reported by argparse as a usage error
    if number < 1:
        raise argparse.ArgumentTypeError("must be a positive integer")
    return number


def get_arguments():
    """Parse the command-line options of the PDF splitter.

    Returns:
        argparse.Namespace with three attributes:
        ``pdf`` (str), ``count`` (int, at least 1) and ``regex`` (str).

    All three options are required; argparse exits with a usage message when
    one is missing or when ``--count`` is not a positive integer.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-p", "--pdf", dest="pdf", required=True,
        help="path of the PDF file to split",
    )
    # A count of 0 would make range() fail with a zero step and a negative one
    # would silently produce no output, so reject both at the command line.
    parser.add_argument(
        "-c", "--count", dest="count", type=_positive_int, required=True,
        help="number of pages per output file",
    )
    parser.add_argument(
        "-r", "--regex", dest="regex", required=True,
        help="regular expression searched in the first page of each chunk; "
             "capture group 1 becomes the output file name",
    )
    return parser.parse_args()
def split_pdf(file, page_count, regex):
    """Split a PDF into chunks of ``page_count`` pages, one file per chunk.

    Each output file is named after capture group 1 of ``regex`` as found in
    the text of the chunk's first page, with ".pdf" appended, and is written
    into ``output_dir``.

    Args:
        file: Path (or binary file object) of the PDF to split.
        page_count: Number of consecutive pages per output file. The final
            chunk may be shorter if the page total is not a multiple of it.
        regex: Regular expression with at least one capture group.

    Raises:
        ValueError: If ``page_count`` is less than 1, or if ``regex`` does not
            match the text of a chunk's first page.
        IndexError: If ``regex`` matches but defines no capture group.
    """
    if page_count < 1:
        raise ValueError("page_count must be at least 1")

    pattern = re.compile(regex)  # compiled once, reused for every chunk
    reader = PdfReader(file)
    # len(reader.pages) instead of reader.numPages: the latter was deprecated
    # and later removed from PyPDF2.
    total_pages = len(reader.pages)

    for start in range(0, total_pages, page_count):
        # Clamp so a trailing partial chunk does not index past the last page.
        stop = min(start + page_count, total_pages)

        writer = PdfWriter()
        # Add every page of the chunk exactly once.
        for page_number in range(start, stop):
            writer.add_page(reader.pages[page_number])

        text = reader.pages[start].extract_text() or ""
        match = pattern.search(text)
        if match is None:
            raise ValueError(
                "regex %r did not match the text of page %d"
                % (regex, start + 1)
            )

        # Build the full target path rather than os.chdir()-ing into the
        # output folder, so the process working directory is left alone.
        target = os.path.join(output_dir, match.group(1) + ".pdf")
        with open(target, "wb") as output_stream:
            writer.write(output_stream)
if __name__ == "__main__":
    # Script entry point: read the command-line options and split the PDF.
    arguments = get_arguments()
    split_pdf(arguments.pdf, arguments.count, arguments.regex)
Credit: https://pastebin.com/mDRV77pp
A solution based on PyMuPDF:
import fitz # PyMuPDF
doc = fitz.open("input.pdf")

# Every employee record spans two consecutive pages, so visit the first page
# of each pair (indices 0, 2, 4, ...).
for i in range(0, len(doc), 2):
    page = doc[i]
    # One tuple per word; item 4 of each tuple is the word's text.
    # sort=True orders the words top-to-bottom, then left-to-right.
    words = page.get_text("words", sort=True)
    for k, word in enumerate(words):
        if word[4] != "Dear":
            continue
        j = k + 2  # 1st name (skipped "Mr.", "Mrs." etc.)
        if j >= len(words):
            break  # greeting is cut off: nothing follows the honorific
        # append name components until one ends with a comma
        # (or the words run out, so a missing comma cannot raise IndexError)
        names = [words[j][4]]
        while not words[j][4].endswith(",") and j + 1 < len(words):
            j += 1
            names.append(words[j][4])
        # Drop the comma that terminates the greeting.
        filename = "-".join(names).rstrip(",") + ".pdf"
        new = fitz.open()  # new, empty PDF
        # Clamp to_page so a document with an odd page count still works.
        new.insert_pdf(doc, from_page=i, to_page=min(i + 1, len(doc) - 1))
        new.ez_save(filename)
        new.close()
        break  # one output file per page pair: ignore any later "Dear"
The whole thing works on the assumption that the greeting contains "Dear", followed by an honorific (Mr./Mrs./Ms. or similar), followed by any number of names, the last of which ends with a comma. We extract the text of the odd-numbered pages as a list of words and sort them (just to be sure) by y coordinate, then by x coordinate.