[SOLVED] Extract consecutive two pages from a pdf document and save each file with a text from each first page as the filenames

Extract consecutive two pages from a pdf document and save each file with a text from each first page as the filenames

I have a 100-page PDF document. Every two consecutive pages contain the data of one employee. I need Python code that extracts each pair of pages and saves it as a separate file, using text extracted from the first page of the pair as the filename. For example:

The 100-page PDF document should be saved as 50 separate files
The first page of each file contains the text Dear Miles Wood, Dear Kate Aaron etc,
The first extracted filename should be Miles_Wood.pdf and second Kate_Aaron.pdf and so on..

Will be most pleased with a python solution

Thanks in advance

I have tried to adapt a seemingly similar python solution by the following but it doesn't appear to work for me

from PyPDF2 import PdfReader, PdfWriter
import re
import argparse
import os
 
# Directory the script is launched from; the split PDFs go into ./output below it.
cwd = os.getcwd()
output_dir = os.path.join(cwd, 'output')
# exist_ok=True creates the directory only when it is missing, without the
# race window of a separate "exists?" check followed by a create.
os.makedirs(output_dir, exist_ok=True)
 
def get_arguments():
    """Parse the command line and return the resulting namespace.

    Three options are required:
      -p / --pdf    stored as ``pdf``
      -c / --count  stored as ``count`` (converted to ``int``)
      -r / --regex  stored as ``regex``
    """
    # Each entry pairs the option flags with the keyword settings for add_argument.
    option_specs = (
        (("-p", "--pdf"), {"dest": "pdf", "required": True}),
        (("-c", "--count"), {"dest": "count", "type": int, "required": True}),
        (("-r", "--regex"), {"dest": "regex", "required": True}),
    )
    cli = argparse.ArgumentParser()
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
 
def split_pdf(file, page_count, regex):
    """Split a PDF into consecutive chunks named after text on each chunk's first page.

    Args:
        file: Source PDF, passed straight to ``PdfReader``.
        page_count: Number of pages per output file; must be at least 1.
            The last chunk is shorter when the document length is not a
            multiple of ``page_count``.
        regex: Pattern searched in the extracted text of each chunk's first
            page. Its first capture group becomes the output filename
            (".pdf" is appended).

    Raises:
        ValueError: If ``page_count`` is less than 1, or if ``regex`` does not
            match the text of a chunk's first page.

    Each chunk is written into ``output_dir``.
    """
    if page_count < 1:
        raise ValueError(f"page_count must be at least 1, got {page_count}")

    pattern = re.compile(regex)
    reader = PdfReader(file)
    total_pages = len(reader.pages)

    for start in range(0, total_pages, page_count):
        text = reader.pages[start].extract_text() or ""
        match = pattern.search(text)
        if match is None:
            # Fail with the page number instead of an AttributeError on None.
            raise ValueError(
                f"regex {regex!r} did not match the text of page {start + 1}"
            )

        writer = PdfWriter()
        # Add each page of the chunk exactly once and never read past the end.
        stop = min(start + page_count, total_pages)
        for page_number in range(start, stop):
            writer.add_page(reader.pages[page_number])

        # Build an explicit path rather than os.chdir(), so the process
        # working directory (and later relative paths) stay untouched.
        target = os.path.join(output_dir, match.group(1) + ".pdf")
        with open(target, "wb") as output_stream:
            writer.write(output_stream)
 
 
if __name__ == "__main__":
    # Script entry point: read the CLI options, then split the PDF with them.
    arguments = get_arguments()
    split_pdf(arguments.pdf, arguments.count, arguments.regex)

Credit https://pastebin.com/mDRV77pp

Solution

A solution based on PyMuPDF:

import fitz  # PyMuPDF

def _greeting_names(words):
    """Return the name words that follow the first complete "Dear <honorific>".

    ``words`` is the list returned by ``page.get_text("words")``; the text of
    each entry is at index 4. The word right after "Dear" is taken to be an
    honorific ("Mr.", "Mrs." etc.) and skipped. Name words are collected up
    to and including the first one ending with a comma. Returns ``None``
    when no such greeting is found on the page.
    """
    for k, word in enumerate(words):
        if word[4] != "Dear":
            continue
        names = []
        # k + 1 is the honorific, so the name starts at k + 2; slicing keeps
        # this safe when "Dear" is one of the last words on the page.
        for entry in words[k + 2:]:
            names.append(entry[4])
            if entry[4].endswith(","):
                return names
    return None


doc = fitz.open("input.pdf")
for i in range(0, len(doc), 2):
    # Clamp so an odd page count yields a final one-page file.
    last = min(i + 1, len(doc) - 1)
    words = doc[i].get_text("words", sort=True)
    names = _greeting_names(words)
    # [:-1] drops the comma that terminates the last name component
    stem = "-".join(names)[:-1] if names else ""
    if stem:
        filename = stem + ".pdf"
    else:
        # Never reuse the previous chunk's name: that would overwrite its file.
        filename = f"pages-{i + 1}-{last + 1}.pdf"
        print(f"No greeting found on page {i + 1}; saving as {filename}")
    new = fitz.open()
    new.insert_pdf(doc, from_page=i, to_page=last)
    new.ez_save(filename)
    new.close()

The whole thing works on the assumption that the greeting contains "Dear", followed by an honorific (Mr., Mrs., Ms. or similar), followed by any number of name components, the last of which ends with a comma. We extract the text of the first page of each pair (the odd-numbered pages) as a list of words and sort them (just to be sure) by y, then by x coordinate.