python selenium-webdriver web-scraping beautifulsoup

Web scraping a website with a paginated table but no Next button


I am trying to scrape the data from the first 3 pages of the paginated table here: https://www.fda.gov/safety/recalls-market-withdrawals-safety-alerts. So far I can only get data from the first page. (They do have an API, but it only updates weekly, which is not frequent enough for me.)

This is what I have:

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import time
import json

# Base and target URLs
root = 'https://www.fda.gov'
website = f'{root}/safety/recalls-market-withdrawals-safety-alerts'

# Set up Selenium WebDriver
driver = webdriver.Chrome()
driver.get(website)

# Select "Food & Beverages" filter
dropdown = Select(driver.find_element("id", "edit-field-regulated-product-field"))
dropdown.select_by_value("2323")  # 2323 corresponds to Food & Beverages

time.sleep(2)  # Wait for the page to load

# Initialize data storage
recall_data = []
page_count = 0
max_pages = 1  # Set your page limit here

while page_count < max_pages:
    # Parse the page content
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Locate the table
    table = soup.find('table', {'class': 'table'})
    if not table:
        break

    # Extract data from the current page
    rows = table.find_all('tr')[1:]  # Skip header row
    for row in rows:
        cols = row.find_all('td')
        if len(cols) > 1:
            recall_info = {
                'Date': cols[0].text.strip(),
                'Brand Names': cols[1].text.strip(),
                'Product Description': cols[2].text.strip(),
                'Product Type': cols[3].text.strip(),
                'Recall Reason Description': cols[4].text.strip(),
                'Company Name': cols[5].text.strip(),
                'Terminated Recall': cols[6].text.strip(),
            }
            recall_data.append(recall_info)

    # Check for the "Next" button
    try:
        next_button = driver.find_element("xpath", "//a[contains(@class, 'sNext:Next')]")
        next_button.click()
        page_count += 1
        time.sleep(2)  # Wait for the next page to load
    except Exception as e:
               print("Next button not found or click failed, ending pagination.")
               break

import csv

# Save data to CSV
csv_filename = 'recalls.csv'

# Define CSV header
csv_headers = [
    'Date', 
    'Brand Names', 
    'Product Description', 
    'Product Type', 
    'Recall Reason Description', 
    'Company Name', 
    'Terminated Recall'
]

with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=csv_headers)
    
    # Write header
    writer.writeheader()
    
    # Write rows
    writer.writerows(recall_data)

print(f"Data has been saved to {csv_filename}")

# Close the driver
driver.quit()

I have tried checking for a Next button, adding 'per_page=' and '/max_rows' to the URL, and adding a page count / page numbers in the code, but so far I can't get beyond the first page. There is a Next button on the page, but it doesn't carry that label in the HTML.


Solution

  • You are not getting the Next button because your selector can't find it. When I inspected the page, I found that the <a> node has no class at all, yet you are trying to locate the element by class. Use the CSS selector #datatable_next > a or the XPath //li[@id='datatable_next']/a for the Next button instead (the first sketch after the full code below shows a related way to detect the last page).

    Try the following code, where I have added a WebDriverWait so that the script waits for the data table to load before it starts parsing (a second sketch after the code shows a more robust way to wait between pages).

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import Select
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from bs4 import BeautifulSoup
    import time
    import csv
    
    # Base and target URLs
    root = 'https://www.fda.gov'
    website = f'{root}/safety/recalls-market-withdrawals-safety-alerts'
    
    # Set up Selenium WebDriver
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    driver.get(website)
    
    # Select "Food & Beverages" filter
    dropdown = Select(driver.find_element(By.ID, "edit-field-regulated-product-field"))
    dropdown.select_by_value("2323")  # 2323 corresponds to Food & Beverages
    
    # Wait for the loading spinner to disappear (i.e. the table has loaded)
    wait.until(EC.invisibility_of_element_located((By.ID, "datatable_processing")))
    
    # Initialize data storage
    recall_data = []
    page_count = 0
    max_pages = 3  # Page limit; the question asks for the first 3 pages
    
    while page_count < max_pages:
        # Parse the page content
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    
        # Locate the table
        table = soup.find('table', {'class': 'table'})
        if not table:
            break
    
        # Extract data from the current page
        rows = table.find_all('tr')[1:]  # Skip header row
        for row in rows:
            cols = row.find_all('td')
            if len(cols) > 1:
                recall_info = {
                    'Date': cols[0].text.strip(),
                    'Brand Names': cols[1].text.strip(),
                    'Product Description': cols[2].text.strip(),
                    'Product Type': cols[3].text.strip(),
                    'Recall Reason Description': cols[4].text.strip(),
                    'Company Name': cols[5].text.strip(),
                    'Terminated Recall': cols[6].text.strip(),
                }
                recall_data.append(recall_info)
    
        # Check for the "Next" button
        try:
            # The XPath in the question didn't match the element; this is the main change
            next_button = driver.find_element(By.XPATH, "//li[@id='datatable_next']/a")
            next_button.click()
            page_count += 1
            time.sleep(2)  # Wait for the next page to load
        except Exception as e:
            print("Next button not found or click failed, ending pagination.")
            break
    
    # Save data to CSV
    csv_filename = 'recalls.csv'
    
    # Define CSV header
    csv_headers = [
        'Date',
        'Brand Names',
        'Product Description',
        'Product Type',
        'Recall Reason Description',
        'Company Name',
        'Terminated Recall'
    ]
    
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=csv_headers)
    
        # Write header
        writer.writeheader()
    
        # Write rows
        writer.writerows(recall_data)
    
    print(f"Data has been saved to {csv_filename}")
    
    # Close the driver
    driver.quit()
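
    A side note on the selector: pagers like this one typically signal the last page by adding a "disabled" class to the <li id="datatable_next"> element rather than removing the link. That is an assumption about this particular page (verify it in your browser's inspector), but if it holds, you can end pagination cleanly instead of waiting for a click to fail:

    # Hedged sketch: stop when the pager marks Next as disabled.
    # Assumes the <li> keeps the id "datatable_next" and gains a
    # "disabled" class on the last page -- check this in the inspector.
    next_li = driver.find_element(By.ID, "datatable_next")
    if "disabled" in (next_li.get_attribute("class") or ""):
        print("Reached the last page, stopping.")
    else:
        next_li.find_element(By.TAG_NAME, "a").click()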
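
    On the waiting strategy: the fixed time.sleep(2) after each click works, but a more robust pattern is to grab a reference to the first table row before clicking and then wait for that element to go stale, which signals the table has been redrawn. This is a sketch under the assumption that the page replaces the rows in the DOM on each page change:

    # Hedged sketch: replace the time.sleep(2) inside the loop with a
    # wait for the old first row to be detached after clicking Next.
    first_row = driver.find_element(By.CSS_SELECTOR, "table.table tbody tr")
    next_button.click()
    wait.until(EC.staleness_of(first_row))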