See latest update at bottom
Using Python 3.12.3 and Selenium, I'm trying to load more rows before scraping and am very new to the process. Ideally I'd load all of them, or at least as many as possible, though the website may auto-limit the total reviews on the page at some point. By manually clicking, I have been able to click it at least 10 more times without it failing. Any help would be appreciated. Please let me know if I can provide any other context.
Here is a screenshot of the section I'd like to click: Show More Button. I'm consistently unable to click this section and I believe it's due to the :before pseudo-element; additionally, there are a number of div.Button__container elements, so my attempts to click usually end up pressing buttons I don't mean to.
Here is the script I'm using to pull the reviews, but it's unable to actually click load more rows, so I'm hoping to bridge that gap using selenium:
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium import webdriver
from IPython.display import display, Image
def extract_ratings(tags):
    """Extract numeric rating strings from a sequence of BeautifulSoup tags.

    Each tag is expected to carry an aria-label of the form
    "Rating <n> out of 5"; tags without that format are skipped with a
    diagnostic message.

    Returns a list of rating strings, e.g. ["4", "5", ...].
    """
    ratings = []
    for tag in tags:
        aria_label = tag.get('aria-label')
        # Only labels of the form "Rating <n> out of 5" carry a rating.
        if aria_label and 'Rating ' in aria_label and ' out of 5' in aria_label:
            ratings.append(aria_label.split('Rating ')[1].split(' out of 5')[0])
        else:
            print(f"Skipping element with aria-label: {aria_label}")
    return ratings


if __name__ == "__main__":
    driver = webdriver.Chrome()
    title = driver.title

    # McCurdy - I'm Glad my Mom Died
    # BUG FIX: the original wrapped this in `for i in range(0, 1000, 100):`
    # but never used `i`, so it just fetched the identical URL ten times
    # (with a pointless sleep each round).
    response = requests.get('https://www.goodreads.com/book/show/26074156/reviews?reviewFilters={%22workId%22:%22kca://work/amzn1.gr.work.v1.FSsY8ohzUZCeEXoBsiEYqw%22,%22after%22:%22NjgxNSwxNTAwNjU3MjE4NDI1%22}')
    print(response.status_code)
    doc = BeautifulSoup(response.text, 'html.parser')

    # BUG FIX: `book_tags_` was never defined in the original (NameError).
    # Collect every element carrying an aria-label attribute; the format
    # filter inside extract_ratings() discards the non-rating ones.
    book_tags_ = doc.find_all(attrs={'aria-label': True})

    # The original built an empty 5-column DataFrame and then overwrote it;
    # build the frame once from the extracted ratings instead.
    df = pd.DataFrame({'Rating': extract_ratings(book_tags_)})
Here is my code that I'm using to attempt a button press:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, ElementClickInterceptedException
# Selector for the reviews "Show More" button. There are several
# div.Button__container elements on the page, so anchor on the Divider
# wrapper that only the reviews pager has; defined once instead of
# repeating the literal in every wait.
SHOW_MORE_SELECTOR = 'div.Divider.Divider--contents.Divider--largeMargin > div.Button__container'

url = "https://www.goodreads.com/book/show/59364173/reviews?reviewFilters={%22workId%22:%22kca://work/amzn1.gr.work.v3.JeHZlXvg2e1mD9_k%22,%22after%22:%22MjYwMTYsMTY2MDc1MjY5MjY2Mw%22}"


def click_show_more(max_retries=5):
    """Repeatedly click the reviews 'Show More' button until it disappears.

    Relies on the module-level `driver`. `max_retries` bounds consecutive
    recoverable failures (stale element / intercepted click) so the loop
    cannot spin forever — the original `continue`d unconditionally on
    those exceptions.
    """
    retries = 0
    while True:
        try:
            # Scroll to the bottom of the page so the button is in view.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Find the 'Show More' button.
            show_more_button = WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, SHOW_MORE_SELECTOR))
            )
            # Scroll the element itself into view before clicking.
            driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
            # Ensure the element is clickable.
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, SHOW_MORE_SELECTOR))
            )
            # A JavaScript click bypasses the :before pseudo-element that
            # intercepts an ordinary Selenium click.
            driver.execute_script("arguments[0].click();", show_more_button)
            # The old button going stale means the next page of rows loaded.
            WebDriverWait(driver, 10).until(EC.staleness_of(show_more_button))
            retries = 0  # successful round: reset the failure counter
        except TimeoutException:
            print("No more 'Show More' button found or timed out.")
            break
        except (StaleElementReferenceException, ElementClickInterceptedException) as e:
            # BUG FIX: the original looped forever on these; bound the retries.
            retries += 1
            print(f"{type(e).__name__}: retry {retries}/{max_retries}.")
            if retries >= max_retries:
                break
        except Exception as e:
            print(f"Error clicking 'Show More' button: {e}")
            break


if __name__ == "__main__":
    # Initialize WebDriver (or another driver you're using).
    driver = webdriver.Chrome()
    # Open the page and give the reviews time to render.
    driver.get(url)
    time.sleep(20)
    click_show_more()
After @x1337loser 's comment, I added an append to list section to the getdata() function and after which I started getting the following error: Link to new code error
I also removed user image from the print out and added a data dictionary and write to csv to the end of the script, but these steps do not appear to be causing any issues.
See new code below:
import time
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup
import pandas as pd
# Module-level accumulators for the scraped review fields. get_data()
# appends one entry per review; the lists are zipped into a DataFrame
# and written to CSV at the end of the script.
ReviewText = []
ratings = []
Reviewer = []
reviewdt = []
ReviewerId = []
Author_ID = []
# The GraphQL endpoint is called with verify=False below, so silence
# the resulting InsecureRequestWarning spam.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def get_data(content):
    """Extract review fields from one getReviews GraphQL response page.

    Appends one entry per review to the module-level accumulator lists
    (Reviewer, reviewdt, ratings, ReviewText, ReviewerId, Author_ID).
    Reviews with a null creator are skipped.
    """
    data = content['data']['getReviews']['edges']
    for i in data:
        creator = i['node']['creator']
        # Some reviews come back with a null creator; skip them instead of
        # crashing with "TypeError: 'NoneType' object is not subscriptable".
        if creator is None:
            print("Skipping review with missing creator")
            continue
        id_user = creator['id']
        # (creator['imageUrlSquare'] is also available if the avatar URL
        # is ever needed again; it was removed from the printout.)
        is_author = creator['isAuthor']
        follower_count = creator['followersCount']
        name = creator['name']
        review = BeautifulSoup(i['node']['text'], "lxml").text  # strip HTML tags from the text
        review_create_data = i['node']['createdAt']
        # BUG FIX: the original called pandas.to_datetime, but this script
        # imports pandas as pd (NameError at runtime).
        result_ms = pd.to_datetime(review_create_data, unit='ms')  # decode ms-epoch timestamp
        review_liked = i['node']['likeCount']
        rating = i['node']['rating']
        print(f"reviewers_name: {name}\nreviewer_user_id: {id_user}\nIs reviewr is author: {is_author}\nreview_date: {result_ms}\nreviewer_follower: {follower_count}\nreview: {review}\nreview_ratings: {rating}\nreview_liked: {review_liked}\n========================================")
        # BUG FIX: the original wrapped every value in braces ({review}),
        # which builds a one-element *set*, so the CSV columns were full of
        # set reprs like {'name'}. Append the plain values instead.
        Reviewer.append(name)
        reviewdt.append(result_ms)
        ratings.append(rating)
        ReviewText.append(review)
        ReviewerId.append(id_user)
        Author_ID.append(is_author)
def gatherNextPage(resourceId):
    """Fetch every page of reviews for one Goodreads work via its GraphQL API.

    resourceId is the workId taken from the reviews URL (kca://work/...).
    Each page (up to 100 reviews) is handed to get_data(); pagination
    follows the server's nextPageToken until it comes back empty.
    """
    url = "https://kxbwmqov6jgg3daaamb744ycu4.appsync-api.us-east-1.amazonaws.com/graphql"
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0",
        # The server adds this API key automatically and strictly checks it
        # on every client request.
        "X-Api-Key": "da2-xpgsdydkbregjhpr6ejzqdhuwy"
    }
    query = "query getReviews($filters: BookReviewsFilterInput!, $pagination: PaginationInput) {\n getReviews(filters: $filters, pagination: $pagination) {\n ...BookReviewsFragment\n __typename\n }\n}\n\nfragment BookReviewsFragment on BookReviewsConnection {\n totalCount\n edges {\n node {\n ...ReviewCardFragment\n __typename\n }\n __typename\n }\n pageInfo {\n prevPageToken\n nextPageToken\n __typename\n }\n __typename\n}\n\nfragment ReviewCardFragment on Review {\n __typename\n id\n creator {\n ...ReviewerProfileFragment\n __typename\n }\n recommendFor\n updatedAt\n createdAt\n spoilerStatus\n lastRevisionAt\n text\n rating\n shelving {\n shelf {\n name\n webUrl\n __typename\n }\n taggings {\n tag {\n name\n webUrl\n __typename\n }\n __typename\n }\n webUrl\n __typename\n }\n likeCount\n viewerHasLiked\n commentCount\n}\n\nfragment ReviewerProfileFragment on User {\n id: legacyId\n imageUrlSquare\n isAuthor\n ...SocialUserFragment\n textReviewsCount\n viewerRelationshipStatus {\n isBlockedByViewer\n __typename\n }\n name\n webUrl\n contributor {\n id\n works {\n totalCount\n __typename\n }\n __typename\n }\n __typename\n}\n\nfragment SocialUserFragment on User {\n viewerRelationshipStatus {\n isFollowing\n isFriend\n __typename\n }\n followersCount\n __typename\n}\n"
    data = {
        "operationName": "getReviews",
        "variables": {
            "filters": {
                "resourceType": "WORK",
                "resourceId": f"{resourceId}"
            },
            "pagination": {
                "limit": 100
            }
        },
        "query": query
    }
    while True:
        time.sleep(3)  # pace requests to avoid rate limiting
        resp = requests.post(url, headers=headers, json=data, verify=False)
        page = resp.json()
        get_data(page)
        nextPageToken = page['data']['getReviews']['pageInfo']['nextPageToken']
        if not nextPageToken:
            break
        # CLEANUP: the original rebuilt the entire request body (giant
        # query string included) just to add the page token, and carried
        # an unused `n, data_next = 1, data`. Mutating the pagination dict
        # in place sends the identical payload.
        data["variables"]["pagination"] = {
            "after": f"{nextPageToken}",  # token from the previous response; 100 results max per request
            "limit": 100
        }
# Fetch every page of reviews for the target work; get_data() fills the
# module-level accumulator lists as a side effect.
gatherNextPage("kca://work/amzn1.gr.work.v3.JeHZlXvg2e1mD9_k")
# Zip the accumulators into one table, one row per review, and persist it.
data_dict = {
'ReviewDate': reviewdt,
'ReviewerName': Reviewer,
'Rating': ratings,
'ReviewText': ReviewText,
'ReviewerID': ReviewerId,
'IsAuthor': Author_ID
#'ReviewerMeta': reviewer_meta
}
df = pd.DataFrame(data_dict)
df. to_csv('McCurdy_Sample_Sept30.csv', index=False)
Note: This answer contains a different method to reach your goal. (Module used: requests, bs4, pandas, time)
Hi there,
Based on your question I think you are trying to fetch all the information related to the review data. Well, I found a better solution than using Selenium; here is my mind map:
Your target app has a GraphQL API endpoint https://kxbwmqov6jgg3daaamb744ycu4.appsync-api.us-east-1.amazonaws.com/graphql
which fetches all the review details from the server, so if we send a request specifying a resourceId
in the POST body to this endpoint, we can easily get that data with the help of the Python requests library and a little bit of coding. Here is my code:
Note: to avoid rate limiting I used time.sleep(3) to minimize the thread.
import time
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup
import pandas
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def get_data(content):
    """Print the key fields of every review in one getReviews response page.

    content is the decoded JSON body of a getReviews GraphQL response.
    Reviews with a null creator are skipped (they otherwise raise
    "TypeError: 'NoneType' object is not subscriptable").
    """
    data = content['data']['getReviews']['edges']
    for i in data:
        creator = i['node']['creator']
        # Guard against reviews whose creator object is null.
        if creator is None:
            continue
        id_user = creator['id']
        rev_img = creator['imageUrlSquare']
        is_author = creator['isAuthor']
        follower_count = creator['followersCount']
        name = creator['name']
        review = BeautifulSoup(i['node']['text'], "lxml").text  # strip HTML tags from the text
        review_create_data = i['node']['createdAt']
        result_ms = pandas.to_datetime(review_create_data, unit='ms')  # decode ms-epoch timestamp
        review_liked = i['node']['likeCount']
        rating = i['node']['rating']
        print(f"reviewers_name: {name}\nreviewer_user_id: {id_user}\nIs reviewr is author: {is_author}\nreview_date: {result_ms}\nreviewer_image: {rev_img}\nreviewer_follower: {follower_count}\nreview: {review}\nreview_ratings: {rating}\nreview_liked: {review_liked}\n========================================")
def gatherNextPage(resourceId):
    """Fetch every page of reviews for one Goodreads work via its GraphQL API.

    resourceId is the workId taken from the reviews URL (kca://work/...).
    Each page (up to 100 reviews) is handed to get_data(); pagination
    follows the server's nextPageToken until it comes back empty.
    """
    url = "https://kxbwmqov6jgg3daaamb744ycu4.appsync-api.us-east-1.amazonaws.com/graphql"
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0",
        # The server adds this API key automatically and strictly checks it
        # on every client request.
        "X-Api-Key": "da2-xpgsdydkbregjhpr6ejzqdhuwy"
    }
    query = "query getReviews($filters: BookReviewsFilterInput!, $pagination: PaginationInput) {\n getReviews(filters: $filters, pagination: $pagination) {\n ...BookReviewsFragment\n __typename\n }\n}\n\nfragment BookReviewsFragment on BookReviewsConnection {\n totalCount\n edges {\n node {\n ...ReviewCardFragment\n __typename\n }\n __typename\n }\n pageInfo {\n prevPageToken\n nextPageToken\n __typename\n }\n __typename\n}\n\nfragment ReviewCardFragment on Review {\n __typename\n id\n creator {\n ...ReviewerProfileFragment\n __typename\n }\n recommendFor\n updatedAt\n createdAt\n spoilerStatus\n lastRevisionAt\n text\n rating\n shelving {\n shelf {\n name\n webUrl\n __typename\n }\n taggings {\n tag {\n name\n webUrl\n __typename\n }\n __typename\n }\n webUrl\n __typename\n }\n likeCount\n viewerHasLiked\n commentCount\n}\n\nfragment ReviewerProfileFragment on User {\n id: legacyId\n imageUrlSquare\n isAuthor\n ...SocialUserFragment\n textReviewsCount\n viewerRelationshipStatus {\n isBlockedByViewer\n __typename\n }\n name\n webUrl\n contributor {\n id\n works {\n totalCount\n __typename\n }\n __typename\n }\n __typename\n}\n\nfragment SocialUserFragment on User {\n viewerRelationshipStatus {\n isFollowing\n isFriend\n __typename\n }\n followersCount\n __typename\n}\n"
    data = {
        "operationName": "getReviews",
        "variables": {
            "filters": {
                "resourceType": "WORK",
                "resourceId": f"{resourceId}"
            },
            "pagination": {
                "limit": 100
            }
        },
        "query": query
    }
    while True:
        time.sleep(3)  # pace requests to avoid rate limiting
        resp = requests.post(url, headers=headers, json=data, verify=False)
        page = resp.json()
        get_data(page)
        nextPageToken = page['data']['getReviews']['pageInfo']['nextPageToken']
        if not nextPageToken:
            break
        # CLEANUP: the original rebuilt the entire request body (giant
        # query string included) just to add the page token, and carried
        # an unused `n, data_next = 1, data`. Mutating the pagination dict
        # in place sends the identical payload.
        data["variables"]["pagination"] = {
            "after": f"{nextPageToken}",  # token from the previous response; 100 results max per request
            "limit": 100
        }


gatherNextPage("kca://work/amzn1.gr.work.v3.JeHZlXvg2e1mD9_k")
Regarding resourceId: it is actually the workId
from your URL: reviewFilters={%22workId%22:%22kca://work/amzn1.gr.work.v3.JeHZlXvg2e1mD9_k%22,%22after%22:%22MjYwMTYsMTY2MDc1MjY5MjY2Mw%22}
I hope this will help.
Thanks
Hi, after analyzing the requests with a proxy tool I found out that some creator objects don't have a user id on them (I don't know why); that's why line 19 throws the TypeError: 'NoneType' object is not subscriptable
exception. This can be fixed by adding an exception handler in the get_data()
function like this:
def get_data(content):
    """Extract and accumulate review fields from one getReviews response page.

    Appends one entry per review to the module-level accumulator lists
    (Reviewer, reviewdt, ratings, ReviewText, ReviewerId, Author_ID).
    Reviews with a null creator or missing fields are skipped with a
    message.
    """
    data = content['data']['getReviews']['edges']
    for i in data:
        creator = i['node']['creator']
        # The reported TypeError ("'NoneType' object is not subscriptable")
        # comes from reviews whose creator is null. Test for that directly
        # instead of the original bare `except Exception: pass`, which
        # silently swallowed every other bug as well.
        if creator is None:
            continue
        try:
            id_user = creator['id']
            is_author = creator['isAuthor']
            follower_count = creator['followersCount']
            name = creator['name']
            review = BeautifulSoup(i['node']['text'], "lxml").text  # strip HTML tags from the text
            result_ms = pd.to_datetime(i['node']['createdAt'], unit='ms')  # decode ms-epoch timestamp
            review_liked = i['node']['likeCount']
            rating = i['node']['rating']
        except (KeyError, TypeError) as e:
            # Narrow, visible skip instead of a silent pass.
            print(f"Skipping malformed review: {e}")
            continue
        print(f"reviewers_name: {name}\nreviewer_user_id: {id_user}\nIs reviewr is author: {is_author}\nreview_date: {result_ms}\nreviewer_follower: {follower_count}\nreview: {review}\nreview_ratings: {rating}\nreview_liked: {review_liked}\n========================================")
        # BUG FIX: the original appended {value} set literals, which is why
        # the rows looked like {Timestamp(...)},{'0_0 stephen'},{0} in the
        # CSV. Append the plain values instead.
        Reviewer.append(name)
        reviewdt.append(result_ms)
        ratings.append(rating)
        ReviewText.append(review)
        ReviewerId.append(id_user)
        Author_ID.append(is_author)
Let me know if I missed something regarding the error you've faced!
Language detection using the langdetect module: pip3 install langdetect
from langdetect import detect
# Module-level accumulators for the scraped review fields. get_data()
# appends one entry per review; the lists are zipped into a DataFrame
# and written to CSV at the end of the script.
ReviewText = []
ratings = []
Reviewer = []
reviewdt = []
ReviewerId = []
Author_ID = []
# Detected language code (from langdetect) for each review text.
Review_language = []
# The GraphQL endpoint is called with verify=False, so silence the
# resulting InsecureRequestWarning spam.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def get_data(content):
    """Extract, language-detect, and accumulate reviews from one response page.

    Appends one entry per review to the module-level accumulator lists
    (Reviewer, reviewdt, ratings, ReviewText, ReviewerId, Author_ID,
    Review_language). Reviews with a null creator, missing fields, or
    undetectable language are skipped with a message.
    """
    data = content['data']['getReviews']['edges']
    for i in data:
        creator = i['node']['creator']
        # Null creator caused the "'NoneType' object is not subscriptable"
        # TypeError; skip those reviews explicitly.
        if creator is None:
            continue
        try:
            id_user = creator['id']
            is_author = creator['isAuthor']
            follower_count = creator['followersCount']
            name = creator['name']
            review = BeautifulSoup(i['node']['text'], "lxml").text  # strip HTML tags from the text
            result_ms = pd.to_datetime(i['node']['createdAt'], unit='ms')  # decode ms-epoch timestamp
            review_liked = i['node']['likeCount']
            rating = i['node']['rating']
            # NOTE: langdetect raises on empty/undetectable text, so keep
            # detect() inside the try as well.
            review_lang = detect(review)
        except Exception as e:
            # Keep the original best-effort behaviour (skip the review),
            # but say why instead of a silent `pass`.
            print(f"Skipping review: {e}")
            continue
        print(f"reviewers_name: {name}\nreviewer_user_id: {id_user}\nIs reviewr is author: {is_author}\nreview_date: {result_ms}\nreviewer_follower: {follower_count}\nreview: {review}\nreview_language: {review_lang}\nreview_ratings: {rating}\nreview_liked: {review_liked}\n========================================")
        # BUG FIX: the original appended {value} set literals, producing
        # set reprs in every CSV column. Append the plain values instead.
        Reviewer.append(name)
        reviewdt.append(result_ms)
        ratings.append(rating)
        ReviewText.append(review)
        ReviewerId.append(id_user)
        Author_ID.append(is_author)
        Review_language.append(review_lang)
Thanks