python selenium-webdriver pdf

How to create a searchable PDF using Python and Selenium?


I want to create a program like FireShot (premium version) to take a webpage on chromedriver and convert it into a pdf.

Currently this is the code I came up with:

import time
import os
import glob
import base64
from PyPDF2 import PdfMerger
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager

# Browser configuration: Chrome starts maximized, the sticky print-preview
# state lists "Save as PDF" as the recent destination, and --kiosk-printing
# auto-confirms the print dialog.
prefs = {
    "printing.print_preview_sticky_settings.appState":
        '{"recentDestinations":[{"id":"Save as PDF"}]}',
}
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument("--kiosk-printing")
options.add_experimental_option("prefs", prefs)

# webdriver_manager downloads/locates a matching chromedriver binary.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

# Open the Coursera login page.
login_url = "https://www.coursera.org/?authMode=login"
driver.get(login_url)

# Manual step: complete the sign-in in the browser. You are then redirected
# to another page, so hand control to the second window handle.
driver.switch_to.window(driver.window_handles[1])

def save_pdf(driver, file_name, *, paper_width=8.27, paper_height=11.69,
             landscape=False, **print_options):
    """Print the page currently loaded in *driver* to a PDF file.

    The PDF is produced by the DevTools ``Page.printToPDF`` command, so the
    text in the output stays selectable/searchable.

    Args:
        driver: Object exposing ``execute_cdp_cmd(cmd, params)`` (a Chromium
            based Selenium WebDriver).
        file_name: Path of the PDF file to write (overwritten if it exists).
        paper_width: Paper width in inches. Defaults to 8.27 (A4).
        paper_height: Paper height in inches. Defaults to 11.69 (A4).
        landscape: Whether to print in landscape orientation.
        **print_options: Additional ``Page.printToPDF`` parameters, passed
            through verbatim (e.g. ``printBackground=True``). Keys given here
            take precedence over the named arguments above.
    """
    params = {
        'landscape': landscape,
        'paperWidth': paper_width,
        'paperHeight': paper_height,
        **print_options,
    }
    data = driver.execute_cdp_cmd("Page.printToPDF", params)
    # Decode before opening the file so a malformed response cannot leave an
    # empty/truncated PDF behind.
    pdf_bytes = base64.b64decode(data['data'])
    with open(file_name, 'wb') as file:
        file.write(pdf_bytes)

def scroll_and_save(driver, scrollable_xpath, output_prefix, *, pause=2):
    """Save one PDF per scroll step of a scrollable element.

    Starting at the element's current position, the page is printed with
    ``save_pdf``, the element is scrolled down by one ``clientHeight``, and
    the process repeats until ``scrollTop`` stops changing.

    Args:
        driver: Selenium WebDriver positioned on the target page.
        scrollable_xpath: XPath of the element to scroll.
        output_prefix: Prefix for the generated files, which are named
            ``{output_prefix}_page_{n}.pdf`` with ``n`` starting at 1.
        pause: Seconds to wait after each scroll so new content can load.

    Returns:
        List of the PDF file names written, in capture order.

    NOTE(review): the capture is made with ``Page.printToPDF``; whether the
    printed output reflects the element's scroll position depends on the
    page's print layout -- confirm against the target site.
    """
    read_position = "return arguments[0].scrollTop;"
    scrollable_div = driver.find_element(By.XPATH, scrollable_xpath)
    file_list = []
    page_num = 1
    # Seed with the real offset rather than a sentinel: with -1 an element
    # that cannot scroll at all was captured twice before the loop noticed
    # that the position had not moved.
    last_scroll_position = driver.execute_script(read_position, scrollable_div)

    while True:
        file_name = f"{output_prefix}_page_{page_num}.pdf"
        save_pdf(driver, file_name)
        file_list.append(file_name)

        driver.execute_script("arguments[0].scrollTop += arguments[0].clientHeight;", scrollable_div)
        time.sleep(pause)  # Allow time for new content to load

        new_scroll_position = driver.execute_script(read_position, scrollable_div)
        if new_scroll_position == last_scroll_position:
            break  # The browser clamped the scroll: the end was reached
        last_scroll_position = new_scroll_position
        page_num += 1

    return file_list

def merge_pdfs(file_list, output_file):
    """Concatenate PDFs into *output_file*, then delete the inputs.

    Args:
        file_list: Paths of the PDFs to merge, in the desired page order.
        output_file: Path of the merged PDF to write.

    The input files are removed only after the merged file has been written
    successfully; if appending or writing raises, the inputs are left on
    disk and the exception propagates.
    """
    merger = PdfMerger()
    try:
        for pdf in file_list:
            merger.append(pdf)
        merger.write(output_file)
    finally:
        # Always release the merger's resources, even when a step above fails.
        merger.close()

    # Clean up individual PDF files
    for pdf in file_list:
        os.remove(pdf)

# Base name shared by the per-page PDFs and the merged output file.
output_prefix = "practical_quiz_1"
# Absolute XPath of the element that is scrolled between captures.
scrollable_xpath = "/html/body/div[5]/div/div/div/div[2]/div[2]/div"

# Capture one PDF per scroll step, then combine them into "<prefix>.pdf".
file_list = scroll_and_save(
    driver,
    scrollable_xpath=scrollable_xpath,
    output_prefix=output_prefix,
)
merge_pdfs(file_list, f"{output_prefix}.pdf")

The problem I am facing is that data = driver.execute_cdp_cmd("Page.printToPDF", params) only saves the top part of the webpage as a PDF, even after I scroll down. How do I resolve this while keeping the text in the PDF searchable?


Solution

  • I was unable to reproduce this exact issue on any other website: elsewhere, printing with driver.execute_cdp_cmd("Page.printToPDF", params) captures the entire webpage in the PDF with no need to scroll, so I am not sure why it didn't work for Coursera.

    To resolve it, I changed the page zoom and the params passed into this call:

    # Zoom the page out to 90%, then print it onto an oversized 12 x 25 sheet
    # in portrait orientation.
    driver.execute_script("document.body.style.zoom='90%'")
    params = {
        'landscape': False,
        'paperWidth': 12,
        'paperHeight': 25,
    }
    data = driver.execute_cdp_cmd("Page.printToPDF", params)
    

    To resize the PDF for rendering it on GitHub, I added:

    import fitz  # PyMuPDF
    
    # Re-layout every page of the captured PDF onto an A4 page, trimming 30
    # units from the left and right edges of the source page.
    src = fitz.open("course_1/week_1/practical_quiz_1.pdf")  # Open source PDF
    doc = fitz.open()  # Create a new PDF document
    try:
        for ipage in src:
            rect = ipage.rect  # Get original page dimensions
            crop_rect = fitz.Rect(rect.x0 + 30, rect.y0, rect.x1 - 30, rect.y1)  # Adjust left/right padding

            # Keep the source orientation: landscape A4 for wide pages,
            # portrait A4 otherwise.
            fmt = fitz.paper_size("a4-l" if rect.width > rect.height else "a4")

            page = doc.new_page(width=fmt[0], height=fmt[1])  # Create new page
            page.show_pdf_page(page.rect, src, ipage.number, clip=crop_rect)  # Apply cropping

        doc.save("course_1/week_1/practical_quiz_1_a4.pdf")
    finally:
        # Close both documents even if a page fails to render or the save
        # fails; the original snippet never closed the source document.
        doc.close()
        src.close()
    

    This seemed to do the trick.

    Code: https://github.com/psymbio/math_ml/blob/main/coursera_pdf_maker.ipynb

    PDF Custom: https://github.com/psymbio/math_ml/blob/main/course_1/week_1/practical_quiz_1.pdf

    PDF A4: https://github.com/psymbio/math_ml/blob/main/course_1/week_1/practical_quiz_1_a4.pdf