I am trying to scrape the data from the first 3 pages of the paginated table here: https://www.fda.gov/safety/recalls-market-withdrawals-safety-alerts . So far I can only get data from the first page. (They do have an API, but it only updates weekly, which is not frequent enough for me.)
This is what I have:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import time
import json
# Base and target URLs
root = 'https://www.fda.gov'
website = f'{root}/safety/recalls-market-withdrawals-safety-alerts'
# Set up Selenium WebDriver
driver = webdriver.Chrome()
driver.get(website)
# Select "Food & Beverages" filter
dropdown = Select(driver.find_element("id", "edit-field-regulated-product-field"))
dropdown.select_by_value("2323") # 2323 corresponds to Food & Beverages
time.sleep(2) # Wait for the page to load
# Initialize data storage
recall_data = []
page_count = 0
max_pages = 1 # Set your page limit here
while page_count < max_pages:
    # Parse the page content
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Locate the table
    table = soup.find('table', {'class': 'table'})
    if not table:
        break
    # Extract data from the current page
    rows = table.find_all('tr')[1:]  # Skip header row
    for row in rows:
        cols = row.find_all('td')
        if len(cols) > 1:
            recall_info = {
                'Date': cols[0].text.strip(),
                'Brand Names': cols[1].text.strip(),
                'Product Description': cols[2].text.strip(),
                'Product Type': cols[3].text.strip(),
                'Recall Reason Description': cols[4].text.strip(),
                'Company Name': cols[5].text.strip(),
                'Terminated Recall': cols[6].text.strip(),
            }
            recall_data.append(recall_info)
    # Check for the "Next" button
    try:
        next_button = driver.find_element("xpath", "//a[contains(@class, 'sNext:Next')]")
        next_button.click()
        page_count += 1
        time.sleep(2)  # Wait for the next page to load
    except Exception as e:
        print("Next button not found or click failed, ending pagination.")
        break
import csv
# Save data to CSV
csv_filename = 'recalls.csv'
# Define CSV header
csv_headers = [
    'Date',
    'Brand Names',
    'Product Description',
    'Product Type',
    'Recall Reason Description',
    'Company Name',
    'Terminated Recall'
]
with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=csv_headers)
    # Write header
    writer.writeheader()
    # Write rows
    writer.writerows(recall_data)
print(f"Data has been saved to {csv_filename}")
# Close the driver
driver.quit()
I have tried checking for a next button, adding 'per_page=' and '/max_rows' to the URL, and adding a page count / page numbers to the code, but so far I can't get beyond the first page. There is a Next button, but it doesn't have that designation in the HTML code.
You are not getting the Next button because the selector you are using can't find it. When I inspected the page, I found that the <a> node doesn't have any class, yet you are trying to locate the element by class.
Try the CSS selector #datatable_next > a or the XPath //li[@id='datatable_next']/a for the Next button.
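For example, either locator can be used with Selenium's standard By strategies (a minimal sketch, assuming driver is the webdriver.Chrome() instance you already have):

from selenium.webdriver.common.by import By

# Both locators point at the same "Next" link in the DataTables pagination bar
next_button = driver.find_element(By.CSS_SELECTOR, "#datatable_next > a")
# next_button = driver.find_element(By.XPATH, "//li[@id='datatable_next']/a")
next_button.click()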
Try the following code, where I have added a WebDriverWait so that the script waits for the data table to load before it starts parsing.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import csv
# Base and target URLs
root = 'https://www.fda.gov'
website = f'{root}/safety/recalls-market-withdrawals-safety-alerts'
# Set up Selenium WebDriver
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get(website)
# Select "Food & Beverages" filter
dropdown = Select(driver.find_element("id", "edit-field-regulated-product-field"))
dropdown.select_by_value("2323") # 2323 corresponds to Food & Beverages
# waiting for the spinner to hide (page load)
wait.until(EC.invisibility_of_element_located((By.ID,"datatable_processing")))
# Initialize data storage
recall_data = []
page_count = 0
max_pages = 3  # Set your page limit here; 3 gives the first three pages from the question
while page_count < max_pages:
    # Parse the page content
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Locate the table
    table = soup.find('table', {'class': 'table'})
    if not table:
        break
    # Extract data from the current page
    rows = table.find_all('tr')[1:]  # Skip header row
    for row in rows:
        cols = row.find_all('td')
        if len(cols) > 1:
            recall_info = {
                'Date': cols[0].text.strip(),
                'Brand Names': cols[1].text.strip(),
                'Product Description': cols[2].text.strip(),
                'Product Type': cols[3].text.strip(),
                'Recall Reason Description': cols[4].text.strip(),
                'Company Name': cols[5].text.strip(),
                'Terminated Recall': cols[6].text.strip(),
            }
            recall_data.append(recall_info)
    # Check for the "Next" button
    try:
        # The original XPath was not valid for the element. This is the main change
        next_button = driver.find_element("xpath", "//li[@id='datatable_next']/a")
        next_button.click()
        page_count += 1
        time.sleep(2)  # Wait for the next page to load
    except Exception as e:
        print("Next button not found or click failed, ending pagination.")
        break
# Save data to CSV
csv_filename = 'recalls.csv'
# Define CSV header
csv_headers = [
    'Date',
    'Brand Names',
    'Product Description',
    'Product Type',
    'Recall Reason Description',
    'Company Name',
    'Terminated Recall'
]
with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=csv_headers)
    # Write header
    writer.writeheader()
    # Write rows
    writer.writerows(recall_data)
print(f"Data has been saved to {csv_filename}")
# Close the driver
driver.quit()
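If the fixed time.sleep(2) after clicking Next ever proves flaky, one option is to reuse the same spinner wait after each click (a sketch, assuming the datatable_processing overlay is shown again while each new page loads):

next_button.click()
page_count += 1
# Wait for the DataTables "processing" overlay to disappear instead of sleeping a fixed time
wait.until(EC.invisibility_of_element_located((By.ID, "datatable_processing")))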