I want to create a program like FireShot (premium version) that takes a webpage loaded in ChromeDriver and converts it into a PDF.
Currently this is the code I came up with:
import time
import os
import glob
import base64
from PyPDF2 import PdfMerger
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
# Chrome preference that lists "Save as PDF" as the recent print destination.
prefs = {"printing.print_preview_sticky_settings.appState": '{"recentDestinations":[{"id":"Save as PDF"}]}'}
options.add_experimental_option("prefs", prefs)
options.add_argument("--kiosk-printing") # Auto confirm print
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
# Load the webpage
driver.get("https://www.coursera.org/?authMode=login")
# Manual step: complete the sign-in in the browser; you are then redirected to another page.
# NOTE(review): window_handles[1] assumes a second window/tab already exists when this
# line runs. Executed straight through (rather than cell by cell) it would raise
# IndexError before the sign-in has finished -- confirm how this is meant to be run.
driver.switch_to.window(driver.window_handles[1])
def save_pdf(driver, file_name, params=None):
    """Print the page currently shown by *driver* to a PDF file.

    Sends the ``Page.printToPDF`` DevTools command through
    ``driver.execute_cdp_cmd`` and writes the base64-decoded ``data`` field
    of the response to *file_name*.

    Args:
        driver: Object exposing ``execute_cdp_cmd(command, params)``.
        file_name: Path of the PDF to write; an existing file is overwritten.
        params: Optional mapping of ``Page.printToPDF`` options. Its entries
            override the defaults (portrait, 8.27 x 11.69 paper), so a caller
            can request a larger sheet without editing this function.
    """
    print_options = {'landscape': False, 'paperWidth': 8.27, 'paperHeight': 11.69}
    if params:
        print_options.update(params)
    data = driver.execute_cdp_cmd("Page.printToPDF", print_options)
    # Decode before opening the file so a malformed response does not leave
    # an empty PDF behind.
    pdf_bytes = base64.b64decode(data['data'])
    with open(file_name, 'wb') as file:
        file.write(pdf_bytes)
def scroll_and_save(driver, scrollable_xpath, output_prefix, pause=2):
    """Save one PDF per scroll step of a scrollable element.

    Locates the element at *scrollable_xpath*, then repeatedly saves the page
    with ``save_pdf`` and advances the element's ``scrollTop`` by its
    ``clientHeight`` until the scroll offset stops changing.

    Args:
        driver: Selenium WebDriver showing the page to capture.
        scrollable_xpath: XPath of the element to scroll.
        output_prefix: Prefix for the files, which are named
            ``{output_prefix}_page_{n}.pdf`` starting at n = 1.
        pause: Seconds to sleep after each scroll so new content can load.

    Returns:
        List of the PDF file names written, in capture order.
    """
    scrollable_div = driver.find_element(By.XPATH, scrollable_xpath)
    file_list = []
    page_num = 1
    # Start from the element's real offset. A sentinel such as -1 never equals
    # a real scrollTop, so an element that cannot scroll would be captured twice.
    last_scroll_position = driver.execute_script("return arguments[0].scrollTop;", scrollable_div)
    while True:
        file_name = f"{output_prefix}_page_{page_num}.pdf"
        save_pdf(driver, file_name)
        file_list.append(file_name)
        driver.execute_script("arguments[0].scrollTop += arguments[0].clientHeight;", scrollable_div)
        time.sleep(pause)  # Allow time for new content to load
        new_scroll_position = driver.execute_script("return arguments[0].scrollTop;", scrollable_div)
        if new_scroll_position == last_scroll_position:
            break  # The offset did not move: the end has been reached
        last_scroll_position = new_scroll_position
        page_num += 1
    return file_list
def merge_pdfs(file_list, output_file):
    """Concatenate PDFs into *output_file*, then delete the inputs.

    Args:
        file_list: Paths of the PDFs to append, in output order.
        output_file: Path of the merged PDF to write.

    The input files are removed only after the merged file has been written.
    If appending or writing raises, the inputs are left in place and the
    exception propagates.
    """
    merger = PdfMerger()
    try:
        for pdf in file_list:
            merger.append(pdf)
        merger.write(output_file)
    finally:
        # Always close the merger so it does not stay open when a step above fails.
        merger.close()
    # Clean up individual PDF files
    for pdf in file_list:
        os.remove(pdf)
# Absolute XPath of the element that scroll_and_save scrolls.
scrollable_xpath = "/html/body/div[5]/div/div/div/div[2]/div[2]/div"
# Prefix shared by the per-step PDFs and the merged output file.
output_prefix = "practical_quiz_1"
# Capture one PDF per scroll step, then combine them into "<prefix>.pdf".
file_list = scroll_and_save(driver, scrollable_xpath, output_prefix)
merge_pdfs(file_list, f"{output_prefix}.pdf")
The problem I am currently facing is that `data = driver.execute_cdp_cmd("Page.printToPDF", params)`
only saves the top part of the webpage as a PDF, even though I scroll down. How do I resolve this while keeping the text in the PDF searchable?
I was unable to reproduce this exact issue on other websites: there, printing with `driver.execute_cdp_cmd("Page.printToPDF", params)`
captures the entire webpage with no need to scroll, so I am not sure why it didn't work for Coursera.
To resolve this, I changed the page zoom and the params passed to this call:
# Set the page body to 90% zoom before printing.
driver.execute_script("document.body.style.zoom='90%'")
# Custom, larger sheet in place of the 8.27 x 11.69 values used in save_pdf
# (Page.printToPDF paper dimensions are given in inches).
params = {'landscape': False, 'paperWidth': 12, 'paperHeight': 25}
data = driver.execute_cdp_cmd("Page.printToPDF", params)
To resize the PDF so that it renders properly on GitHub, I added:
import fitz # PyMuPDF
# Re-lay every page of the custom-size PDF onto an A4 page, trimming 30 units
# from the left and right edges of each source page.
src = fitz.open("course_1/week_1/practical_quiz_1.pdf")  # Open source PDF
doc = fitz.open()  # Create a new PDF document
try:
    for ipage in src:
        rect = ipage.rect  # Get original page dimensions
        crop_rect = fitz.Rect(rect.x0 + 30, rect.y0, rect.x1 - 30, rect.y1)  # Adjust left/right padding
        # Wide source pages go to landscape A4, all others to portrait A4.
        fmt = fitz.paper_size("a4-l" if rect.width > rect.height else "a4")
        page = doc.new_page(width=fmt[0], height=fmt[1])  # Create new page
        page.show_pdf_page(page.rect, src, ipage.number, clip=crop_rect)  # Apply cropping
    doc.save("course_1/week_1/practical_quiz_1_a4.pdf")
finally:
    # Close both documents, including the source, even when a step above fails.
    doc.close()
    src.close()
This seemed to do the trick.
Code: https://github.com/psymbio/math_ml/blob/main/coursera_pdf_maker.ipynb
PDF Custom: https://github.com/psymbio/math_ml/blob/main/course_1/week_1/practical_quiz_1.pdf
PDF A4: https://github.com/psymbio/math_ml/blob/main/course_1/week_1/practical_quiz_1_a4.pdf