python pdf text adobe-reader

Using Adobe Reader's "Export as text" function in Python


I want to convert lots of PDFs into text files. The formatting is very important, and only Adobe Reader seems to get it right (PDFMiner and PyPDF2 do not).

Is there a way to automate the "export as text" function from Adobe Reader?


Solution

  • The following code will do what you want for one file. To process many files, I recommend organizing the script into a few small functions and then calling those functions in a loop. You'll need to install the third-party `keyboard` library first, using pip (`pip install keyboard`) or some other tool.

    import pathlib as pl
    import os
    import keyboard
    import time
    import io
    
    
    # Drive Adobe Reader's "save as text" menu entry with simulated keystrokes
    # to export one PDF as a text file placed next to it.
    #
    # Keystroke plan:
    #   alt      -- activate keyboard shortcuts
    #   f        -- open file menu
    #   v        -- select "save as text" option
    #   <path>   -- type the output path into the save dialog
    #   alt+s    -- save button
    #   ctrl+w   -- close file

    # Not referenced anywhere below; presumably intended as an abort hotkey
    # -- TODO confirm or remove.
    KILL_KEY = 'esc'
    read_path = pl.Path("C:/Users/Sam/Downloads/WS-1401-IP.pdf")
    ####################################################################

    # The text file is written beside the PDF, with the same stem.
    write_path = read_path.with_suffix(".txt")

    # Remember the modification time of any pre-existing output file. Merely
    # checking that the file exists after saving would succeed immediately
    # when we are overwriting, before the viewer has written anything.
    try:
        previous_mtime = os.stat(write_path).st_mtime_ns
    except OSError:
        previous_mtime = None
    overwrite_file = previous_mtime is not None

    # Open the PDF with its associated application; the sleeps give the GUI
    # time to react before the next keystroke is sent.
    os.startfile(read_path)
    time.sleep(1)
    keyboard.press_and_release('alt')
    time.sleep(1)
    keyboard.press_and_release('f')  # -- open file menu
    time.sleep(1)
    keyboard.press_and_release('v')  # -- select "save as text" option
    time.sleep(1)
    keyboard.write(str(write_path))
    time.sleep(1)
    keyboard.press_and_release('alt+s')
    time.sleep(2)
    if overwrite_file:
        # Presumably answers "yes" to an overwrite confirmation prompt
        # -- TODO confirm against the viewer's dialog.
        keyboard.press_and_release('y')

    # Wait (up to five seconds) for the program to finish saving: the output
    # file must exist AND differ in modification time from what was there
    # before the export started.
    waited_too_long = True
    for _ in range(5):
        time.sleep(1)
        try:
            current_mtime = os.stat(write_path).st_mtime_ns
        except OSError:
            continue  # output file not there yet
        if current_mtime != previous_mtime:
            waited_too_long = False
            break

    if waited_too_long:
        msg = f"program probably saved to somewhere other than {write_path}\n"
        raise ValueError(msg)

    keyboard.press_and_release('ctrl+w')  # close the file