python selenium-webdriver web-scraping beautifulsoup tripadvisor

I'm having issues with a Python program for scraping TripAdvisor reviews


The following Python program is meant to scrape a selection of review data from all reviews on our TripAdvisor page. It isn't working and I'm unable to troubleshoot it. It doesn't iterate through the pages, so it stops after the first page of reviews, and the files it outputs are empty. Here is the code:

import time
import json
import csv
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# Define the base URL of the TripAdvisor page
base_url = "https://www.tripadvisor.co.uk/Attraction_Review-g186225-d213774-Reviews-Scudamore_s_Punting_Company-Cambridge_Cambridgeshire_England.html"

# Set the checkpoint file name
checkpoint_file = "checkpoint.txt"

# Check if the checkpoint file exists
if os.path.exists(checkpoint_file):
    # Read the checkpoint file to get the last page scraped
    with open(checkpoint_file, "r") as f:
        last_page = int(f.read().strip())
else:
    last_page = 0

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--start-maximized")  # Maximize the browser window

# Launch Selenium with the configured options
driver = webdriver.Chrome(options=chrome_options)  # Replace with the path to your Chrome WebDriver

# Send a GET request to the base URL
try:
    driver.get(base_url)
    time.sleep(10)  # Wait for the page to load

    # Find and click the "Accept Cookies" button using JavaScript injection
    driver.execute_script('document.getElementById("onetrust-accept-btn-handler").click();')
    time.sleep(5)  # Wait for the cookies to be accepted

    # Get the page source after accepting cookies
    page_source = driver.page_source

    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(page_source, "html.parser")

    # Find the total number of pages
    page_count_element = soup.find("a", class_="pageNum last")
    if page_count_element:
        page_count = int(page_count_element.text.strip())
    else:
        page_count = 1  # Set a default value if the element is not found

    # Adjust the page count based on the last page scraped
    page_count -= last_page

    # Create a list to store the extracted review data
    review_data = []

    # Iterate over each page
    for page in range(last_page + 1, last_page + page_count + 1):
        # Construct the URL for the current page
        url = f"{base_url[:-5]}-or{page * 10}-{base_url[-5:]}"

        # Send a GET request to the URL
        driver.get(url)
        time.sleep(10)  # Wait for the page to load

        # Get the page source
        page_source = driver.page_source

        # Parse the page source with BeautifulSoup
        soup = BeautifulSoup(page_source, "html.parser")

        # Find all the review elements on the current page
        review_elements = soup.select("#tab-data-qa-reviews-0 > div > div.LbPSX > div > div")

        # Iterate over each review element and extract relevant information
        for review in review_elements:
            try:
                # Extract the reviewer's name
                reviewer_name_elem = review.select_one(".ZVAUHZqh > span > a")
                reviewer_name = reviewer_name_elem.text.strip() if reviewer_name_elem else ""

                # Extract the rating
                rating_elem = review.select_one(".nf9vGX55")
                rating = float(rating_elem["title"].split()[0]) / 10 if rating_elem else 0.0

                # Extract the title
                title_elem = review.select_one("a._1r_My98y > span")
                title = title_elem.text.strip() if title_elem else ""

                # Extract the review text
                review_text_elem = review.select_one("div._1o6B68z4 > div")
                review_text = review_text_elem.text.strip() if review_text_elem else ""

                # Extract the review date
                review_date_elem = review.select_one("div._1OuugO9R")
                review_date = review_date_elem.text.strip() if review_date_elem else ""

                # Create a dictionary to store the review data
                review_info = {
                    "Reviewer Name": reviewer_name,
                    "Rating": rating,
                    "Title": title,
                    "Review Text": review_text,
                    "Review Date": review_date
                }

                # Add the review data to the list
                review_data.append(review_info)

            except Exception as e:
                print("An error occurred while extracting review data:", str(e))

    # Save the checkpoint for the last page scraped
    with open(checkpoint_file, "w") as f:
        f.write(str(last_page + page_count))

    # Close the Selenium driver
    driver.quit()

    # Save the review data to a JSON file
    with open("reviews.json", "w", encoding="utf-8") as json_file:
        json.dump(review_data, json_file, ensure_ascii=False, indent=4)

    # Save the review data to a CSV file
    with open("reviews.csv", "w", encoding="utf-8", newline="") as csv_file:
        fieldnames = ["Reviewer Name", "Rating", "Title", "Review Text", "Review Date"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(review_data)

    # Save the review text to a TXT file
    with open("reviews.txt", "w", encoding="utf-8") as txt_file:
        for review in review_data:
            txt_file.write(review["Review Text"] + "\n")

    print("Scraping completed successfully!")
except Exception as e:
    print("An error occurred during scraping:", str(e))
    driver.quit()

I've tried using requests but wasn't happy with the results. I've gone over the selectors, trying HTML, CSS and XPath variants to grab the data I want to scrape. I've changed output types. I've tried throttling the requests so they don't trigger anti-scraping security on TripAdvisor. I've tried a fair few things. Happy to discuss.

Overall, I'm hoping the program will scrape all of the review fields I've defined into a file. There are over 1,000 reviews, and the program I created would populate the JSON with the review fields I specified for the first page's reviews; however, the fields have no data in them. I'm very open to suggestions at this point.


Solution

  • I couldn't debug your code, but I have successfully scraped TripAdvisor before (with Selenium) using a simple while loop, as below.


    Using my selectForList function for convenience (a rough sketch of what such a helper could look like follows the selectors below), with the following selectors:

    nxt_pg_sel = 'a[href][data-smoke-attr="pagination-next-arrow"]'  # "next page" arrow link
    review_sel = 'div[data-automation="reviewCard"]'  # one element per review card
    rev_dets_sel = {
        # plain strings select an element's text; (selector, attribute) tuples select
        # an attribute; the special '"staticVal"' attribute stores the value as-is
        'from_page': ('', '"staticVal"'),
        'profile_name': 'span>a[href^="\/Profile\/"]',
        'profile_link': ('span>a[href^="\/Profile\/"]', 'href'),
        'about_reviewer': 'span:has(>a[href^="\/Profile\/"])+div',
        'review_votes': 'button[aria-label="Click to add helpful vote"]>span',
        'bubbles': ('svg[aria-label$=" of 5 bubbles"]', 'aria-label'),
        'review_link': ('a[href^="\/ShowUserReviews-"]', 'href'),
        'review_title': 'a[href^="\/ShowUserReviews-"]',
        'about_review': 'div:has(>a[href^="/ShowUserReviews-"])+div:not(:has(div))',
        'review_body': 'div:has(>a[href^="/ShowUserReviews-"])~div>div',
        'review_date': 'div:has(>a[href^="/ShowUserReviews-"])~div:last-child>div',
    }
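
    A minimal sketch of a comparable helper (inferred from how selectForList is called below, not the original implementation): plain-string selectors extract text, (selector, attribute) tuples extract an attribute, and the special '"staticVal"' attribute passes a value through unchanged:

    def selectForList(tag, selectors):
        # build one flat dict (one CSV row) from a single review card
        row = {}
        for key, sel in selectors.items():
            if isinstance(sel, tuple):
                sel, attr = sel
                if attr == '"staticVal"':
                    row[key] = sel  # the "selector" slot holds a literal value
                else:
                    el = tag.select_one(sel)
                    row[key] = el.get(attr) if el else None  # attribute value
            else:
                el = tag.select_one(sel)
                row[key] = el.get_text(' ', strip=True) if el else None  # visible text
        return row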
    

    For your specific page:

    csv_fn_revs = 'Scudamore_s_Punting_Company-tripadvisor_reviews.csv'
    csv_fn_pgs = 'Scudamore_s_Punting_Company-tripadvisor_review_pages.csv'
    pgNum, maxPages = 0, None  # maxPages=None means scrape until there is no next page
    pageUrl = 'https://www.tripadvisor.co.uk/Attraction_Review-g186225-d213774-Reviews-Scudamore_s_Punting_Company-Cambridge_Cambridgeshire_England.html'
    

    And the rest is as I did before with that simple while loop:

    import pandas as pd
    from selenium import webdriver
    from bs4 import BeautifulSoup

    browser = webdriver.Chrome()
    browser.maximize_window()  # maximize the browser window
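
    # note: TripAdvisor's OneTrust cookie banner (handled in the question's script)
    # may overlay the page; if so, it could be dismissed once before the loop,
    # e.g. reusing the question's button id (assumption: the id is unchanged):
    # browser.get(pageUrl)
    # browser.execute_script(
    #     'document.getElementById("onetrust-accept-btn-handler").click();')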
    
    reviews_list, pgList = [], []
    while pageUrl and (maxPages is None or pgNum < maxPages):
        pgNum += 1
        pgList.append({'page': pgNum, 'URL': pageUrl})
        try:
            browser.get(pageUrl)
            rev_dets_sel['from_page'] = (pgNum, '"staticVal"')
            pgSoup = BeautifulSoup(browser.page_source, 'html.parser')
    
            rev_cards = pgSoup.select(review_sel)
        reviews_list += [selectForList(r, rev_dets_sel) for r in rev_cards]
            pgList[-1]['reviews'] = len(rev_cards)
    
            next_page = pgSoup.select_one(nxt_pg_sel)
            if next_page:
                pageUrl = 'https://www.tripadvisor.co.uk' + next_page.get('href')
                pgList[-1]['next_page'] = pageUrl
                print('going to', pageUrl)
            else:
                pageUrl = None  # stop condition
        except Exception as e:
            print(f'Stopping on pg{pgNum} due to {type(e)}:\n{e}')
            break
    
    browser.quit() # Close the browser
    
    # Save as csv
    pd.DataFrame(reviews_list).to_csv(csv_fn_revs, index=False)
    pd.DataFrame(pgList).to_csv(csv_fn_pgs, index=False)
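
    # (optional) to keep the question's checkpoint idea, the last page URL could be
    # persisted and fed back in as pageUrl on the next run; hypothetical sketch:
    # with open('checkpoint.txt', 'w') as f:
    #     f.write(pgList[-1]['URL'])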
    

    It's a bit different from how you did it, and the last column [review_date] isn't perfect, but I was able to scrape all 1621 reviews from your link (see full results in this spreadsheet).
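
    If you want a numeric rating like the one in your original script, the bubbles column can be post-processed afterwards; a sketch, assuming the aria-label text takes the form "4.0 of 5 bubbles":

    df = pd.DataFrame(reviews_list)  # or read csv_fn_revs back in
    df['rating'] = df['bubbles'].str.extract(r'([\d.]+) of 5 bubbles')[0].astype(float)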

    [screenshots: the page-log and scraped-review CSV outputs]