I'm learning web scraping in Python using Selenium. I want to scrape the prices and names of the goods from Amazon and store them in lists. I do this in a while loop that runs until it is no longer possible to click through to the next page, at which point a TimeoutException is thrown. When I debug, I can clearly see that everything works fine: my lists get longer and longer. But when the loop breaks and I print the lists, I see that my program saved only the first loop iteration. I don't really understand what is going on. Here is my code:
from selenium.webdriver.common.by import By
from time import sleep
# Browser setup: start Chrome, open the search-results page and dismiss the
# cookie banner so that it does not cover the pagination links.
from selenium import webdriver

# Paste the URL that you want to scrape.
url = "https://www.amazon.se/-/en/s?k=mirror+sticker&language=en_GB&crid=3LCT7C6GU8FUS&qid=1656847509&sprefix=mirror+sticker%2Caps%2C91&ref=sr_pg_1"

# This opens a new browser window; the argument is the path to the driver
# executable.
# NOTE(review): passing the driver path positionally is reportedly deprecated
# in Selenium 4 in favour of a Service object - confirm against the installed
# Selenium version.
driver = webdriver.Chrome("chromedriver.exe")
driver.get(url)  # actually navigate to the page; without this the window stays blank
sleep(3)  # give the page (and the cookie banner) time to render
driver.find_element(By.ID, "sp-cc-accept").click()  # accept cookies
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def get_text_store(web_elements_lst, storage_lst):
    """Append the text of every web element to ``storage_lst``.

    The ``textContent`` attribute of each element is read once. A non-empty
    value is appended unchanged; an empty or missing value is recorded as the
    placeholder ``"No data"``, so exactly one entry is stored per element.

    Args:
        web_elements_lst: Iterable of objects exposing
            ``get_attribute(name)`` (e.g. Selenium web elements).
        storage_lst: List that receives the extracted strings; it is
            modified in place.

    Returns:
        None.
    """
    for element in web_elements_lst:
        text = element.get_attribute("textContent")
        # Empty string or None both mean "nothing to show" for this element.
        storage_lst.append(text if text else "No data")
names_txt = []  # product names (str) collected across all result pages
prices_txt = []  # product prices (str) collected across all result pages

# Scrape the current result page, then move to the next one until the "Next"
# link can no longer be clicked.
while True:
    web_elements_names = driver.find_elements(
        By.CLASS_NAME, "a-size-base-plus.a-color-base.a-text-normal")  # names (web elements)
    web_elements_prices = driver.find_elements(
        By.CLASS_NAME, "a-price-whole")  # prices (web elements)
    get_text_store(web_elements_names, names_txt)  # text from the name elements
    get_text_store(web_elements_prices, prices_txt)  # text from the price elements
    try:
        # Wait up to 10 seconds for a clickable "Next" link, then follow it.
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//a[text()='Next']"))).click()
    except TimeoutException:
        # No clickable "Next" link appeared: this was the last page.
        print("Timeout Exception")
        break
    # NOTE(review): nothing here waits for the next page to finish loading
    # after the click, so the following iteration may read the page before
    # the new results are rendered - confirm and add a wait if so.
Add a sleep inside the loop, as shown below, so that each new page has time to load before its elements are read:
# Scrape the current result page, then move to the next one until the "Next"
# link can no longer be clicked.
while True:
    web_elements_names = driver.find_elements(
        By.CLASS_NAME, "a-size-base-plus.a-color-base.a-text-normal")  # names (web elements)
    web_elements_prices = driver.find_elements(
        By.CLASS_NAME, "a-price-whole")  # prices (web elements)
    get_text_store(web_elements_names, names_txt)  # text from the name elements
    get_text_store(web_elements_prices, prices_txt)  # text from the price elements
    try:
        # Wait up to 10 seconds for a clickable "Next" link, then follow it.
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//a[text()='Next']"))).click()
    except TimeoutException:
        # No clickable "Next" link appeared: this was the last page.
        print("Timeout Exception")
        break
    # Pause so the next page can load before its elements are collected.
    sleep(3)