python pdf extract pypdf python-pdfreader

Extract each consecutive pair of pages from a PDF document and save each pair as a file named with text from its first page


I have a 100-page PDF document. Every two consecutive pages contain the data of one unique employee. I need Python code that extracts each pair of pages and saves it as a separate file whose filename is text extracted from the first page of that pair. For example:

I would be most pleased with a Python solution.

Thanks in advance

I have tried to adapt the following seemingly similar Python solution, but it doesn't appear to work for me:

from PyPDF2 import PdfReader, PdfWriter
import re
import argparse
import os
 
# All split files are written to an "output" folder inside the directory
# the script is launched from.
cwd = os.getcwd()
output_dir = os.path.join(cwd, 'output')
# exist_ok=True creates the folder only when needed, without the race-prone
# "check, then create" sequence.
os.makedirs(output_dir, exist_ok=True)
 
def get_arguments():
    """Parse the command line and return the resulting namespace.

    Three options are required:
      -p / --pdf    stored as ``pdf``
      -c / --count  stored as ``count`` (converted to ``int``)
      -r / --regex  stored as ``regex``
    """
    # (flags, keyword settings) for every option, in registration order.
    option_table = (
        (("-p", "--pdf"), {"dest": "pdf", "required": True}),
        (("-c", "--count"), {"dest": "count", "type": int, "required": True}),
        (("-r", "--regex"), {"dest": "regex", "required": True}),
    )

    cli = argparse.ArgumentParser()
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
 
def split_pdf(file, page_count, regex):
    """Split a PDF into consecutive chunks, naming each chunk from its first page.

    Args:
        file: Path of the source PDF, passed to ``PdfReader``.
        page_count: Number of consecutive pages per output file. The final
            chunk may be shorter when the document length is not a multiple
            of ``page_count``.
        regex: Pattern searched in the text of each chunk's first page.
            Capture group 1 (or the whole match if the pattern defines no
            groups) becomes the output filename, with ``.pdf`` appended.

    Raises:
        ValueError: If ``page_count`` is smaller than 1.

    Output files are written into the module-level ``output_dir``.
    """
    if page_count < 1:
        raise ValueError("page_count must be at least 1")

    reader = PdfReader(file)
    # len(reader.pages) replaces reader.numPages, which PyPDF2 3.x removed.
    total_pages = len(reader.pages)
    pattern = re.compile(regex)

    for start in range(0, total_pages, page_count):
        # Clamp so a short final chunk never indexes past the last page.
        stop = min(start + page_count, total_pages)

        writer = PdfWriter()
        for page_number in range(start, stop):
            # Add every page of the chunk exactly once.
            writer.add_page(reader.pages[page_number])

        text = reader.pages[start].extract_text() or ""
        match = pattern.search(text)
        stem = ""
        if match:
            captured = match.group(1) if pattern.groups else match.group(0)
            # A group that did not participate in the match yields None.
            stem = (captured or "").strip()
        if not stem:
            # No usable text found: fall back to the 1-based page range so
            # the chunk is still saved instead of aborting the whole run.
            stem = f"pages_{start + 1}-{stop}"

        # Write via an explicit path rather than os.chdir(), leaving the
        # process working directory untouched; "with" closes the file.
        target = os.path.join(output_dir, stem + ".pdf")
        with open(target, "wb") as output_stream:
            writer.write(output_stream)
 
 
if __name__ == "__main__":
    # Script entry point: read the command-line options, then split the PDF.
    arguments = get_arguments()
    split_pdf(arguments.pdf, arguments.count, arguments.regex)

Credit https://pastebin.com/mDRV77pp

Solution

  • A solution based on PyMuPDF:

    import fitz  # PyMuPDF

    PAGES_PER_FILE = 2  # each record spans this many consecutive pages


    def name_from_greeting(words):
        """Return the addressee's name from a 'Dear <honorific> <names>,' greeting.

        ``words`` is the list returned by ``page.get_text("words")``; the
        text of each entry is at index 4. The name components are joined
        with "-" and the trailing comma is dropped. Returns ``None`` when
        no complete greeting is found.
        """
        for k, word in enumerate(words):
            if word[4] != "Dear":
                continue
            names = []
            # k + 1 is the honorific ("Mr.", "Mrs." etc.); names start at k + 2.
            for item in words[k + 2:]:
                names.append(item[4])
                if item[4].endswith(","):
                    return "-".join(names)[:-1] or None
            # Ran out of words before a closing comma: no usable greeting.
            break
        return None


    # "with" closes the source document when done.
    with fitz.open("input.pdf") as doc:
        for first in range(0, len(doc), PAGES_PER_FILE):
            # Clamp so a document with a short final group still works.
            last = min(first + PAGES_PER_FILE - 1, len(doc) - 1)
            words = doc[first].get_text("words", sort=True)
            # Use only the first greeting on the page. When there is none,
            # fall back to the 1-based page range instead of reusing (and
            # overwriting) the previous group's filename.
            name = name_from_greeting(words) or f"pages-{first + 1}-{last + 1}"
            new = fitz.open()
            new.insert_pdf(doc, from_page=first, to_page=last)
            new.ez_save(name + ".pdf")
            new.close()
    

    The whole thing works on the assumption that the greeting contains "Dear", followed by an honorific (Mr., Mrs., Ms. or whatever), followed by any number of names, the last one ending with a comma. We extract the words of the odd-numbered pages (the first page of each pair) in "words" format and sort them (just to be sure) by y, then by x coordinate.