The following Python program is meant to scrape a selection of review data from all reviews on our TripAdvisor page. It isn't working and I'm unable to troubleshoot it. It doesn't iterate through the pages, so it stops after the first page of reviews, and the files it outputs are empty. Here is the code:
import time
import json
import csv
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
# Define the base URL of the TripAdvisor page
base_url = "https://www.tripadvisor.co.uk/Attraction_Review-g186225-d213774-Reviews-Scudamore_s_Punting_Company-Cambridge_Cambridgeshire_England.html"
# Set the checkpoint file name
checkpoint_file = "checkpoint.txt"
# Check if the checkpoint file exists
if os.path.exists(checkpoint_file):
    # Read the checkpoint file to get the last page scraped
    with open(checkpoint_file, "r") as f:
        last_page = int(f.read().strip())
else:
    last_page = 0
# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--start-maximized") # Maximize the browser window
# Launch Selenium with the configured options
driver = webdriver.Chrome(options=chrome_options) # Replace with the path to your Chrome WebDriver
# Send a GET request to the base URL
try:
    driver.get(base_url)
    time.sleep(10) # Wait for the page to load
    # Find and click the "Accept Cookies" button using JavaScript injection
    driver.execute_script('document.getElementById("onetrust-accept-btn-handler").click();')
    time.sleep(5) # Wait for the cookies to be accepted
    # Get the page source after accepting cookies
    page_source = driver.page_source
    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(page_source, "html.parser")
    # Find the total number of pages
    page_count_element = soup.find("a", class_="pageNum last")
    if page_count_element:
        page_count = int(page_count_element.text.strip())
    else:
        page_count = 1 # Set a default value if the element is not found
    # Adjust the page count based on the last page scraped
    page_count -= last_page
    # Create a list to store the extracted review data
    review_data = []
    # Iterate over each page
    for page in range(last_page + 1, last_page + page_count + 1):
        # Construct the URL for the current page
        url = f"{base_url[:-5]}-or{page * 10}-{base_url[-5:]}"
        # Send a GET request to the URL
        driver.get(url)
        time.sleep(10) # Wait for the page to load
        # Get the page source
        page_source = driver.page_source
        # Parse the page source with BeautifulSoup
        soup = BeautifulSoup(page_source, "html.parser")
        # Find all the review elements on the current page
        review_elements = soup.select("#tab-data-qa-reviews-0 > div > div.LbPSX > div > div")
        # Iterate over each review element and extract relevant information
        for review in review_elements:
            try:
                # Extract the reviewer's name
                reviewer_name_elem = review.select_one(".ZVAUHZqh > span > a")
                reviewer_name = reviewer_name_elem.text.strip() if reviewer_name_elem else ""
                # Extract the rating
                rating_elem = review.select_one(".nf9vGX55")
                rating = float(rating_elem["title"].split()[0]) / 10 if rating_elem else 0.0
                # Extract the title
                title_elem = review.select_one("a._1r_My98y > span")
                title = title_elem.text.strip() if title_elem else ""
                # Extract the review text
                review_text_elem = review.select_one("div._1o6B68z4 > div")
                review_text = review_text_elem.text.strip() if review_text_elem else ""
                # Extract the review date
                review_date_elem = review.select_one("div._1OuugO9R")
                review_date = review_date_elem.text.strip() if review_date_elem else ""
                # Create a dictionary to store the review data
                review_info = {
                    "Reviewer Name": reviewer_name,
                    "Rating": rating,
                    "Title": title,
                    "Review Text": review_text,
                    "Review Date": review_date
                }
                # Add the review data to the list
                review_data.append(review_info)
            except Exception as e:
                print("An error occurred while extracting review data:", str(e))
    # Save the checkpoint for the last page scraped
    with open(checkpoint_file, "w") as f:
        f.write(str(last_page + page_count))
    # Close the Selenium driver
    driver.quit()
    # Save the review data to a JSON file
    with open("reviews.json", "w", encoding="utf-8") as json_file:
        json.dump(review_data, json_file, ensure_ascii=False, indent=4)
    # Save the review data to a CSV file
    with open("reviews.csv", "w", encoding="utf-8", newline="") as csv_file:
        fieldnames = ["Reviewer Name", "Rating", "Title", "Review Text", "Review Date"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(review_data)
    # Save the review text to a TXT file
    with open("reviews.txt", "w", encoding="utf-8") as txt_file:
        for review in review_data:
            txt_file.write(review["Review Text"] + "\n")
    print("Scraping completed successfully!")
except Exception as e:
    print("An error occurred during scraping:", str(e))
    driver.quit()
I've tried using requests but wasn't happy with the results. I've gone over the selectors, trying HTML, CSS and XPath selectors to grab the data I want to scrape. I've changed output types. I've tried throttling the requests so it doesn't trigger anti-scraping security on TripAdvisor. I've tried a fair few things. Happy to discuss.
Overall, I'm hoping the program will scrape all of the review fields I've defined into a file. There are over 1,000 reviews, and the program I created would populate the JSON with the review fields I specified for the first page's reviews; however, the fields have no data in them. I'm very open to suggestions at this point.
I couldn't debug your code, but I have successfully scraped TripAdvisor before (with Selenium) using a simple while loop as below.
Using my selectForList function for convenience (a minimal sketch of it follows the selectors below), with the following selectors:
nxt_pg_sel = 'a[href][data-smoke-attr="pagination-next-arrow"]'
review_sel = 'div[data-automation="reviewCard"]'
rev_dets_sel = {
    'from_page': ('', '"staticVal"'),
    'profile_name': 'span>a[href^="/Profile/"]',
    'profile_link': ('span>a[href^="/Profile/"]', 'href'),
    'about_reviewer': 'span:has(>a[href^="/Profile/"])+div',
    'review_votes': 'button[aria-label="Click to add helpful vote"]>span',
    'bubbles': ('svg[aria-label$=" of 5 bubbles"]', 'aria-label'),
    'review_link': ('a[href^="/ShowUserReviews-"]', 'href'),
    'review_title': 'a[href^="/ShowUserReviews-"]',
    'about_review': 'div:has(>a[href^="/ShowUserReviews-"])+div:not(:has(div))',
    'review_body': 'div:has(>a[href^="/ShowUserReviews-"])~div>div',
    'review_date': 'div:has(>a[href^="/ShowUserReviews-"])~div:last-child>div',
}
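(For reference: selectForList isn't defined in this post. Below is a minimal sketch of such a helper, inferred from how rev_dets_sel is used, not the original code; I'm assuming a plain string means "select an element and take its text", a (selector, attribute) tuple means "take that attribute", and the '"staticVal"' marker means "pass the first item through as a constant column".)
def selectForList(card, selectors):
    # card: one review-card element; selectors: dict as described above
    details = {}
    for key, sel in selectors.items():
        if isinstance(sel, tuple):
            sel, attr = sel
            if attr == '"staticVal"':
                details[key] = sel # constant column, e.g. the current page number
                continue
            el = card.select_one(sel)
            details[key] = el.get(attr) if el else None # attribute value
        else:
            el = card.select_one(sel)
            details[key] = el.get_text(' ', strip=True) if el else None # element text
    return details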
For your specific page:
csv_fn_revs = 'Scudamore_s_Punting_Company-tripadvisor_reviews.csv'
csv_fn_pgs = 'Scudamore_s_Punting_Company-tripadvisor_review_pages.csv'
pgNum, maxPages = 0, None
pageUrl = 'https://www.tripadvisor.co.uk/Attraction_Review-g186225-d213774-Reviews-Scudamore_s_Punting_Company-Cambridge_Cambridgeshire_England.html'
And the rest is as I did before with that simple while loop:
import pandas as pd # needed for the DataFrame/to_csv calls at the end

browser = webdriver.Chrome()
browser.maximize_window() # maximize window
reviews_list, pgList = [], []
while pageUrl and (maxPages is None or pgNum < maxPages):
    pgNum += 1
    pgList.append({'page': pgNum, 'URL': pageUrl})
    try:
        browser.get(pageUrl)
        rev_dets_sel['from_page'] = (pgNum, '"staticVal"')
        pgSoup = BeautifulSoup(browser.page_source, 'html.parser')
        rev_cards = pgSoup.select(review_sel)
        reviews_list += [selectForList(r, rev_dets_sel) for r in rev_cards]
        pgList[-1]['reviews'] = len(rev_cards)
        next_page = pgSoup.select_one(nxt_pg_sel)
        if next_page:
            pageUrl = 'https://www.tripadvisor.co.uk' + next_page.get('href')
            pgList[-1]['next_page'] = pageUrl
            print('going to', pageUrl)
        else:
            pageUrl = None # stop condition
    except Exception as e:
        print(f'Stopping on pg{pgNum} due to {type(e)}:\n{e}')
        break
browser.quit() # Close the browser
# Save as csv
pd.DataFrame(reviews_list).to_csv(csv_fn_revs, index=False)
pd.DataFrame(pgList).to_csv(csv_fn_pgs, index=False)
It's a bit different from how you did it, and the last column (review_date) isn't perfect, but I was able to scrape all 1621 reviews from your link (see full results in this spreadsheet).
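If you then want a numeric rating or a tidier date column, here's a small post-processing sketch; the regex patterns are my assumptions about the captured text (bubbles aria-labels like "5.0 of 5 bubbles" and dates like "Written 1 January 2023"), so adjust them to whatever you actually scrape:
import pandas as pd

# Assumed formats, not guaranteed by TripAdvisor:
#   'bubbles'     -> '5.0 of 5 bubbles'
#   'review_date' -> 'Written 1 January 2023'
df = pd.DataFrame(reviews_list)
df['rating'] = df['bubbles'].str.extract(r'([\d.]+) of 5 bubbles')[0].astype(float)
df['written'] = df['review_date'].str.extract(r'Written (.+)')[0]
df.to_csv(csv_fn_revs, index=False)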