I am trying to scrape reviews from Glassdoor. I can scrape the review text, but I have trouble scraping the recommendation (Yes/No). An example URL is https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036.htm. Here is a screenshot of what I am working on.
Here, I have scraped the pros and cons, and I also want to get the recommendation. Inspecting the recommendation icons, the d attribute of the SVG path contains 8.835 for "Yes" and 18.299 for "No". The rest of the code works; the only problem is the "# 3. Scrape Recommendations" part. An example target XPath is
//*[@id="empReview_9142916"]/div[2]/div[2]/div[1]/svg/path
Ten reviews are gathered from each page, but no recommendations are found. Printing the matched elements with
print(svg_elements)
shows an empty list. Below is my current code, with my ID and password removed. Thank you in advance for your help.
import csv
import time
from seleniumbase import SB
from selenium.webdriver.common.by import By
def _recommendation_from_path(d_attribute):
    """Map an SVG path's ``d`` attribute to 'Yes' or 'No'; return None if unrecognized."""
    if not d_attribute:
        return None
    if '8.835 17.64' in d_attribute:  # Unique part of the "Yes" SVG
        return 'Yes'
    if '18.299 5.327' in d_attribute:  # Unique part of the "No" SVG
        return 'No'
    return None


def _sign_in(sb):
    """Submit the email form, then the password form, then call the CAPTCHA click helper."""
    sb.find_element('input[data-test="emailInput-input"]').send_keys("my id")
    sb.sleep(2)  # Wait for the email to be entered
    sb.find_element('button[data-test="email-form-button"]').click()
    sb.sleep(2)  # Wait for the next page to load
    sb.find_element('input[data-test="passwordInput-input"]').send_keys("my password")
    sb.sleep(2)  # Wait for the password to be entered
    sb.find_element('button[data-role-variant="primary"][type="submit"]').click()
    sb.sleep(2)  # Wait for the sign-in process to complete
    sb.uc_gui_click_captcha()


def _write_reviews_csv(csv_filename, pros, cons, recommendations):
    """Write one CSV row per (pros, cons, recommendation) triple, preceded by a header row."""
    with open(csv_filename, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["pros_text", "cons_text", "recommendation"])
        # zip() stops at the shortest list, so surplus entries are not written.
        writer.writerows(zip(pros, cons, recommendations))


def scrape_stackoverflow_cloudflare_and_save_csv(csv_filename="cloudflare_questions.csv"):
    """
    Scrape Glassdoor review pages 1 to 5 (pros, cons, recommendations) and save them to CSV.

    The function name and default file name are kept as-is for compatibility
    with existing callers, although the pages scraped are Glassdoor reviews.

    Args:
        csv_filename: Path of the CSV file to write. The file is overwritten.

    Returns:
        None. Any exception is caught and reported with ``print``.
    """
    base_url = "https://www.glassdoor.com"
    start_url = "https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036.htm"
    # <svg> and <path> live in the SVG namespace, so a plain "/svg/path" step
    # matches nothing in an HTML document; select them by name() instead.
    recommendation_xpath = (
        '//div[contains(@id, "empReview")]/div[2]/div[2]/div[1]'
        '/*[name()="svg"]/*[name()="path"]'
    )
    all_pros = []
    all_cons = []
    all_recommendations = []
    try:
        with SB(uc=True) as sb:
            # Loop through pages 1 to 5
            for page_num in range(1, 6):
                print(f"Scraping page {page_num}...")
                if page_num == 1:
                    sb.uc_open_with_reconnect(start_url, 6)
                else:
                    next_page_link = f"/Reviews/Amazon-Reviews-E6036_P{page_num}.htm"
                    sb.open(base_url + next_page_link)
                if page_num == 2:
                    # The sign-in form is filled in when page 2 is opened.
                    _sign_in(sb)
                sb.sleep(4)  # Wait for the page to load

                # 1. Scrape PROS
                pros_elements = sb.find_elements('span[data-test="review-text-PROS"]')
                pros_texts = [elem.text.strip() for elem in pros_elements if elem.text.strip()]

                # 2. Scrape CONS
                cons_elements = sb.find_elements('span[data-test="review-text-CONS"]')
                cons_texts = [elem.text.strip() for elem in cons_elements if elem.text.strip()]

                # 3. Scrape Recommendations (Yes/No)
                # SeleniumBase takes the selector first and the strategy second.
                path_elements = sb.find_elements(recommendation_xpath, by=By.XPATH)
                labels = (
                    _recommendation_from_path(path.get_attribute('d'))
                    for path in path_elements
                )
                recommendations = [label for label in labels if label is not None]

                # Collect data from this page
                all_pros.extend(pros_texts)
                all_cons.extend(cons_texts)
                all_recommendations.extend(recommendations)

                # Debug: Print collected data for this page
                print(f"Page {page_num} - Pros: {len(pros_texts)}, Cons: {len(cons_texts)}, Recommendations: {len(recommendations)}")

        if not (len(all_pros) == len(all_cons) == len(all_recommendations)):
            # Rows are built by position, so unequal columns mean some reviews
            # are dropped and the remaining rows may pair unrelated values.
            print(
                "Warning: collected "
                f"{len(all_pros)} pros, {len(all_cons)} cons and "
                f"{len(all_recommendations)} recommendations; rows are truncated "
                "to the shortest column and may be misaligned."
            )

        # Save all collected data to CSV
        print("Saving data to CSV...")
        _write_reviews_csv(csv_filename, all_pros, all_cons, all_recommendations)
        print("Scraping completed successfully!")
    except Exception as e:
        # Top-level boundary: report the failure instead of propagating it.
        print(f"An error occurred: {e}")
    finally:
        print("Exiting function (finally block).")
# Script entry point: run the scraper with its default output file name.
if __name__ == "__main__":
    scrape_stackoverflow_cloudflare_and_save_csv()
Instead of using the SVG path values to check whether a review is recommended, check the class of the div, which clearly states whether the rating is positive, negative, or neutral. Note that there are neutral and no-data ratings as well.
I have modified only the recommendation-checking part. You still have to add the checks for the neutral and no-data cases yourself.
Try this:
# Select the parent div of every "Recommend" label; its class name is used
# below to tell positive from negative ratings.
# SeleniumBase takes the selector first and the strategy second.
elements = sb.find_elements('//span[text()="Recommend"]/parent::div', by=By.XPATH)
recommendations = []
for elem in elements:
    # get_attribute() returns None when the attribute is missing; fall back to
    # an empty string so the substring checks cannot raise TypeError.
    attribute = elem.get_attribute('class') or ''
    if 'positiveStyles' in attribute:
        recommendations.append('Yes')
    elif 'negativeStyles' in attribute:
        recommendations.append('No')
    # Any other class (e.g. neutral or no-data ratings) is skipped here.
It should give you the recommended and not recommended reviews.