Issue Description:
I am trying to automate a process that visits a website, scrapes the product details of the top 100 products on the page, and puts them in an Excel file.
Code Explanation:
I have a class WebScraper inside which I call two functions. First I call scroll_and_click_view_more, which simply scrolls down the page I am visiting. Then I call prod_vitals, which extracts the product code and product name from each tile on that page.
Error Description:
Whenever I run the code below with a certain maximum number of products, it gets stuck after a point and throws an "Index out of range" error. If I set max_count_of_products=50, the code gets stuck partway through; if I set max_count_of_products=100, it gets stuck at product 93. There is no fixed index at which it gets stuck: if I change the value of max_count_of_products, the point at which the code fails changes as well.
I am attaching screenshots of the error below, one run with max_count_of_products=50 and one with max_count_of_products=100.
Please find my code below:
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

products_summary = []
max_count_of_products = 100

def scroll_and_click_view_more(driver, href):
    flag = False
    last_height = driver.execute_script("return window.pageYOffset + window.innerHeight")
    while True:
        try:
            driver.execute_script("window.scrollBy(0, 800);")
            time.sleep(4)
            new_height = driver.execute_script("return window.pageYOffset + window.innerHeight")
            try:
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.product-tile')))
            except Exception:
                if new_height == last_height and flag == False:
                    print("Reached the end of the page and no product tiles were found: ", href)
                    return "No product tiles found"
                else:
                    last_height = new_height
                    continue
            flag = True
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            div_elements = soup.find_all('div', class_='product-tile')
            div_count = len(div_elements)
            if div_count > max_count_of_products:
                return driver.page_source
            else:
                driver.execute_script("window.scrollBy(0, 300);")
                time.sleep(3)
                new_height = driver.execute_script("return window.pageYOffset + window.innerHeight")
                if new_height == last_height:
                    return driver.page_source
                else:
                    last_height = new_height
        except Exception as e:
            print(e)
            break
def prod_vitals(soup, title, url):
    count_of_items = 1
    products_data = []  # list to store all product data for our Excel sheet
    for div in soup.find_all('div', class_='product-tile'):  # iterate over each individual product-tile div
        if count_of_items <= max_count_of_products:
            count_of_items = count_of_items + 1
            pro_code = div.select('div.css-1fg6eq7 img')[0]['id']
            pro_name = div.select('div.product-name a.css-avqw6d p.css-1d5mpur')[0].get_text()
            products_data.append({'Product Code': pro_code, 'Product Name': pro_name})  # append the extracted data to the list
            print("Count: ", count_of_items)
            print("Product Code: ", pro_code)
            print("Product Name: ", pro_name)
            print("\n")
        else:
            break
    time.sleep(5)
class WebScraper:
    def __init__(self):
        self.url = "https://staging1-japan.coach.com/shop/new/women/?auto=true"
        options = Options()
        options.add_argument("--remote-debugging-port=9222")
        self.driver = webdriver.Chrome(service=Service(r"c:\Users\DELL\Documents\Self_Project\chromedriver.exe"), options=options)

    def scrape(self):
        self.driver.get(self.url)
        time.sleep(5)
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')  # parse the initial page source
        response = scroll_and_click_view_more(self.driver, 'Link')
        time.sleep(3)
        if response != "No product tiles found" and response != "Reached the end of the page.":
            soup = BeautifulSoup(response, 'html.parser')
            prod_vitals(soup, 'TITLE', self.url)
            time.sleep(2)
        else:
            self.driver.execute_script("window.scrollTo(0,0);")
            time.sleep(3)
        self.driver.close()

scraper = WebScraper()
scraper.scrape()
time.sleep(5)
scraper.driver.quit()
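The comments mention an Excel sheet, but nothing is written to disk yet. Here is a minimal export sketch, assuming prod_vitals is changed to return products_data and that pandas with the openpyxl engine is installed (both of which are assumptions, not part of the code above):

```python
# Sketch only: assumes prod_vitals is modified to `return products_data`
# and that pandas + openpyxl are installed.
import pandas as pd

products_data = prod_vitals(soup, 'TITLE', url)  # hypothetical return value
pd.DataFrame(products_data).to_excel("products.xlsx", index=False)
```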
> Attaching the product div structure below:
<div class="css-0">
  <div class="css-1fg6eq7">
    <div tabindex="-1" style="padding-top: 125%;"></div>
    <img width="237.01" height="296" class="chakra-image css-14ql1gk" src="https://coach.scene7.com/is/image/Coach/cn731_b4ous_a0?$desktopProductTile$" fetchpriority="high" id="CN731 B4OUS" name="タビー 12" data-qa="cm_tile_link_pt_img" contain="none" alt="COACH®,タビー 12,ボディバッグ&斜めがけバッグ,トゥルー ピンク">
  </div>
</div>
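Given that tile structure, the IndexError almost certainly comes from `div.select(...)[0]` on a tile where the expected node is missing. A defensive sketch using BeautifulSoup's select_one, which returns None instead of raising, would let the loop log and skip such tiles (extract_tile is a hypothetical helper, not part of the original code):

```python
def extract_tile(div):
    """Return (code, name) for one product-tile Tag, or None when the markup differs."""
    img = div.select_one('div.css-1fg6eq7 img')  # None instead of IndexError on a miss
    name_tag = div.select_one('div.product-name a.css-avqw6d p.css-1d5mpur')
    if img is None or name_tag is None:
        return None  # tile doesn't match the sample structure above
    return img.get('id'), name_tag.get_text()  # .get() is None-safe if id is absent
```

Calling extract_tile(div) inside the loop and skipping None results replaces the bare [0] indexing that currently raises as soon as one tile lacks that structure.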
**Please find my code below with API method:**
import requests
from googletrans import Translator  # provides the Translator used below

# `headers` (with the auth-bypass cookie) and `max_count_of_products`
# are module-level globals, as elsewhere in this post.

def prod_vitals(title, url):
    count_of_items = 1
    products_data = []  # list to store all product data for our Excel sheet
    page = 0
    list_price = 0  # list price
    sale_price = 0  # sale price
    discount1 = 0  # discount% that is displayed on the site
    discount2 = 0  # discount% calculated manually
    res = "Incorrect"
    if '/shop' not in url:
        print("Not in url", url)
        return "No product tiles found"
    session = requests.Session()  # reuse one session across pages
    try:
        while True:
            page = page + 1  # advance one results page; each page contains 16 results
            ct = count_of_items + 15  # count to adjust the loop, as each page contains 16 results
            if page == 1:
                full_url = f"{url}"
            else:
                full_url = f"{url}?page={page}"
            if 'api/get-shop' not in full_url:
                full_url = full_url.replace('/shop', '/api/get-shop')
            response = session.get(full_url, headers=headers, verify=False)
            products = response.json().get('pageData', {}).get('products', [])
            print(full_url, "\n", len(products), "\n")
            if '/get-shop' in full_url and not products:
                print("No product tiles found", full_url)
                return "No product tiles found"
            for i in products:
                prices = i['defaultVariant']['prices']
                print(prices, "\n")
                pro_code = i['defaultColor']['vgId']
                pro_name = i['name']
                pdpurl = i['defaultColor']['url']
                sale_price = prices['currentPrice'] if prices['currentPrice'] else 0
                list_price = prices['regularPrice'] if 'regularPrice' in prices and prices['regularPrice'] else 0
                discount1 = prices['discount'] if prices['discount'] else 0
                if list_price > 0 and sale_price > 0:
                    discount2 = round(((list_price - sale_price) / list_price) * 100)
                    res = "Correct" if discount1 == discount2 else "Incorrect"
                elif list_price == 0 and discount1 == 0:
                    discount2 = 0
                    res = "Correct"
                translator = Translator()
                translated_pro_name = translator.translate(pro_name, dest='en').text
                if count_of_items <= max_count_of_products:  # keep collecting until we reach the target count
                    products_data.append({'Product Code': pro_code, 'Product Name': translated_pro_name, 'Product URL': url + pdpurl, 'Sale Price': '¥' + format(sale_price, '.2f'), 'List Price': '¥' + format(list_price, '.2f'), 'Discount on site': str(discount1) + '%', 'Actual Discount': str(discount2) + '%', 'Result': res})
                    count_of_items = count_of_items + 1
                else:
                    break
            if ct > max_count_of_products:  # ct past the target means we reached our goal, so break the loop
                break
    except Exception as e:
        print(e)
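To make the discount cross-check above concrete, here is a worked example with illustrative numbers (not taken from the site):

```python
# Illustrative numbers only: a ¥10,000 item on sale for ¥7,000,
# with the site displaying a 30% discount badge.
list_price, sale_price, discount1 = 10000, 7000, 30
discount2 = round(((list_price - sale_price) / list_price) * 100)  # -> 30
res = "Correct" if discount1 == discount2 else "Incorrect"
print(discount2, res)  # 30 Correct
```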
This is an exception raised by your Python script because it didn't find the id value for some products (maybe a different variant of the product ID/code was shown that didn't belong to div.css-1fg6eq7). Adding an exception handler will fix it; here is the modified version of the prod_vitals function:
def prod_vitals(soup, title, url):
    count_of_items = 0
    products_data = []  # list to store all product data for our Excel sheet
    try:  # added an error handler to avoid any disruption while the function is running
        for div in soup.find_all('div', class_='product-tile'):  # iterate over each individual product-tile div
            if count_of_items <= max_count_of_products:
                list_price = 0  # list price
                sale_price = 0  # sale price
                discount1 = 0  # discount% that is displayed on the site
                discount2 = 0  # discount% calculated manually
                count_of_items = count_of_items + 1
                res = "Incorrect"  # result of discount1 == discount2; initialized to Incorrect
                pro_code = div.select('div.css-1fg6eq7 img')[0]['id']
                pro_name = div.select('div.product-name a.css-avqw6d p.css-1d5mpur')[0].get_text()
                products_data.append({'Product Code': pro_code, 'Product Name': pro_name})  # append the extracted data to the list
                print("Count: ", count_of_items)
                print("Product Code: ", pro_code)
                print("Product Name: ", pro_name)
                print("\n")
            else:
                break
    except Exception:
        pass
    time.sleep(3)
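One caveat: because the try wraps the whole loop, the first malformed tile ends the run and every later product is silently skipped. If the goal is to skip only the offending tile, a per-item handler is a small variant (a sketch, using the same soup, products_data, and max_count_of_products as above):

```python
for div in soup.find_all('div', class_='product-tile'):
    if count_of_items > max_count_of_products:
        break
    try:
        pro_code = div.select('div.css-1fg6eq7 img')[0]['id']
        pro_name = div.select('div.product-name a.css-avqw6d p.css-1d5mpur')[0].get_text()
    except (IndexError, KeyError):
        continue  # this tile doesn't match the expected markup; move on to the next
    count_of_items += 1
    products_data.append({'Product Code': pro_code, 'Product Name': pro_name})
```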
All of the info you're targeting is fetched from this API endpoint, https://staging1-japan.coach.com/api/get-shop/new/women?page=2, with a JSON response, so we can gather this information much faster than with Selenium, using a couple of lines of code and the requests module.
Here is the code:
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  # disable the urllib3 insecure-request warning

count = 0
headers = {
    "Cookie": "auth-bypass=true;"  # this cookie is mandatory for this app; otherwise it throws a '401 Unauthorized' error
}
page = 1
session = requests.Session()
while True:
    page = page + 1  # page results start from 2, and each page contains 16 results
    ct = count + 15  # count to adjust the loop, as each page contains 16 results
    url = f"https://staging1-japan.coach.com/api/get-shop/new/women?page={page}"
    response = session.get(url, headers=headers, verify=False)
    for i in response.json()['pageData']['products']:
        code = i['productId']
        name = i['name']
        count = count + 1
        if count < 101:  # print only while the count has not yet passed 100
            print(f'Product Count: {count}\nProduct Code: {code}\nProduct Name: {name}\n\n')
    if ct > 100:  # ct > 100 means we have reached our goal, so break the loop
        break
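And if the end goal is still the Excel file, the same loop can collect the rows into a list, stop at exactly 100, and dump them with pandas. A sketch under the same endpoint and cookie assumptions as above (pandas and openpyxl are assumed to be installed):

```python
import requests
import pandas as pd

headers = {"Cookie": "auth-bypass=true;"}  # same auth-bypass cookie as above
rows, page = [], 1
while len(rows) < 100:
    page += 1  # results start on page 2; 16 products per page
    resp = requests.get(
        f"https://staging1-japan.coach.com/api/get-shop/new/women?page={page}",
        headers=headers, verify=False,
    )
    products = resp.json().get('pageData', {}).get('products', [])
    if not products:  # ran out of pages before reaching 100
        break
    for i in products:
        rows.append({'Product Code': i['productId'], 'Product Name': i['name']})
        if len(rows) == 100:  # stop at exactly 100 rows
            break
pd.DataFrame(rows).to_excel("products.xlsx", index=False)
```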