python selenium-webdriver web-scraping beautifulsoup

Web scraping a website with a paginated table but no Next button


I am trying to scrape the data from the first 3 pages of the paginated table here: https://www.fda.gov/safety/recalls-market-withdrawals-safety-alerts. So far I can only get data from the first page. (They do have an API, but it only updates weekly, which is not frequent enough for me.)

This is what I have:

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import time
import json

# Base and target URLs
root = 'https://www.fda.gov'
website = f'{root}/safety/recalls-market-withdrawals-safety-alerts'

# Set up Selenium WebDriver
driver = webdriver.Chrome()
driver.get(website)

# Select "Food & Beverages" filter
dropdown = Select(driver.find_element("id", "edit-field-regulated-product-field"))
dropdown.select_by_value("2323")  # 2323 corresponds to Food & Beverages

time.sleep(2)  # Wait for the page to load

# Initialize data storage
recall_data = []
page_count = 0
max_pages = 1  # Set your page limit here

while page_count < max_pages:
    # Parse the page content
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Locate the table
    table = soup.find('table', {'class': 'table'})
    if not table:
        break

    # Extract data from the current page
    rows = table.find_all('tr')[1:]  # Skip header row
    for row in rows:
        cols = row.find_all('td')
        if len(cols) > 1:
            recall_info = {
                'Date': cols[0].text.strip(),
                'Brand Names': cols[1].text.strip(),
                'Product Description': cols[2].text.strip(),
                'Product Type': cols[3].text.strip(),
                'Recall Reason Description': cols[4].text.strip(),
                'Company Name': cols[5].text.strip(),
                'Terminated Recall': cols[6].text.strip(),
            }
            recall_data.append(recall_info)

    # Check for the "Next" button
    try:
        next_button = driver.find_element("xpath", "//a[contains(@class, 'sNext:Next')]")
        next_button.click()
        page_count += 1
        time.sleep(2)  # Wait for the next page to load
    except Exception as e:
               print("Next button not found or click failed, ending pagination.")
               break

import csv

# Save data to CSV
csv_filename = 'recalls.csv'

# Define CSV header
csv_headers = [
    'Date', 
    'Brand Names', 
    'Product Description', 
    'Product Type', 
    'Recall Reason Description', 
    'Company Name', 
    'Terminated Recall'
]

with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=csv_headers)
    
    # Write header
    writer.writeheader()
    
    # Write rows
    writer.writerows(recall_data)

print(f"Data has been saved to {csv_filename}")

# Close the driver
driver.quit()

I have tried checking for a Next button, adding 'per_page=' and '/max_rows' to the URL, and adding a page count / page numbers in the code, but so far I can't get beyond the first page. There is a Next button on the page, but it doesn't carry that label in the HTML.


Solution

  • You are not getting the Next button because your selector can't find it. When I inspected the page, I found that the <a> node has no class at all, yet you are trying to locate the element by class. Use the CSS selector #datatable_next > a or the XPath //li[@id='datatable_next']/a for the Next button instead (the first sketch after the full code below shows a related way to detect the last page).

    Try the following code, where I have added a WebDriverWait so that the script waits for the data table to load before it starts parsing (a second sketch after the code shows a more robust way to wait between pages).

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import Select
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from bs4 import BeautifulSoup
    import time
    import csv
    
    # Base and target URLs
    root = 'https://www.fda.gov'
    website = f'{root}/safety/recalls-market-withdrawals-safety-alerts'
    
    # Set up Selenium WebDriver
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    driver.get(website)
    
    # Select "Food & Beverages" filter
    dropdown = Select(driver.find_element(By.ID, "edit-field-regulated-product-field"))
    dropdown.select_by_value("2323")  # 2323 corresponds to Food & Beverages
    
    # Wait for the loading spinner to disappear (i.e. the table has loaded)
    wait.until(EC.invisibility_of_element_located((By.ID, "datatable_processing")))
    
    # Initialize data storage
    recall_data = []
    page_count = 0
    max_pages = 3  # Page limit; the question asks for the first 3 pages
    
    while page_count < max_pages:
        # Parse the page content
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    
        # Locate the table
        table = soup.find('table', {'class': 'table'})
        if not table:
            break
    
        # Extract data from the current page
        rows = table.find_all('tr')[1:]  # Skip header row
        for row in rows:
            cols = row.find_all('td')
            if len(cols) > 1:
                recall_info = {
                    'Date': cols[0].text.strip(),
                    'Brand Names': cols[1].text.strip(),
                    'Product Description': cols[2].text.strip(),
                    'Product Type': cols[3].text.strip(),
                    'Recall Reason Description': cols[4].text.strip(),
                    'Company Name': cols[5].text.strip(),
                    'Terminated Recall': cols[6].text.strip(),
                }
                recall_data.append(recall_info)
    
        # Check for the "Next" button
        try:
            # The XPath in the question didn't match the element; this is the main change
            next_button = driver.find_element(By.XPATH, "//li[@id='datatable_next']/a")
            next_button.click()
            page_count += 1
            time.sleep(2)  # Wait for the next page to load
        except Exception as e:
            print("Next button not found or click failed, ending pagination.")
            break
    
    # Save data to CSV
    csv_filename = 'recalls.csv'
    
    # Define CSV header
    csv_headers = [
        'Date',
        'Brand Names',
        'Product Description',
        'Product Type',
        'Recall Reason Description',
        'Company Name',
        'Terminated Recall'
    ]
    
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=csv_headers)
    
        # Write header
        writer.writeheader()
    
        # Write rows
        writer.writerows(recall_data)
    
    print(f"Data has been saved to {csv_filename}")
    
    # Close the driver
    driver.quit()
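
    A side note on the selector: pagers like this one typically signal the last page by adding a "disabled" class to the <li id="datatable_next"> element rather than removing the link. That is an assumption about this particular page (verify it in your browser's inspector), but if it holds, you can end pagination cleanly instead of waiting for a click to fail:

    # Hedged sketch: stop when the pager marks Next as disabled.
    # Assumes the <li> keeps the id "datatable_next" and gains a
    # "disabled" class on the last page -- check this in the inspector.
    next_li = driver.find_element(By.ID, "datatable_next")
    if "disabled" in (next_li.get_attribute("class") or ""):
        print("Reached the last page, stopping.")
    else:
        next_li.find_element(By.TAG_NAME, "a").click()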
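
    On the waiting strategy: the fixed time.sleep(2) after each click works, but a more robust pattern is to grab a reference to the first table row before clicking and then wait for that element to go stale, which signals the table has been redrawn. This is a sketch under the assumption that the page replaces the rows in the DOM on each page change:

    # Hedged sketch: replace the time.sleep(2) inside the loop with a
    # wait for the old first row to be detached after clicking Next.
    first_row = driver.find_element(By.CSS_SELECTOR, "table.table tbody tr")
    next_button.click()
    wait.until(EC.staleness_of(first_row))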