I need to filter out all webpages with 0 listings on Grailed. I have over 500k URLs to go through. I'm using Python and Selenium. My problem is that for every new webpage the script needs to click on the cookie and user login pop-up to access the number of listings. The result is that each webpage takes ~13 seconds to process. For 500k URLs this will take 75 days.
An example link: https://www.grailed.com/designers/acne-studios/casual-pants
All 500k links: https://www.grailed.com/designers/designer-name/category-name
Two approaches I'm figuring out:
Try to block the cookie and user login pop-ups. However I'm not sure if this is possible without saving some sort of user profile, after which I'm worried I'll get blocked by Grailed.
Run multiple instances at the same time, preferably between 13 (~2 weeks) and 130 (~14 hours). However, I'm not sure whether it will be costly or how to avoid getting blocked. Do I need to use proxies for this?
Please tell me if I'm missing something. My code:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
import os
import time
# Filter a CSV of Grailed designer/category URLs down to those that show
# enough listings, using ONE headless Chrome session for the whole run.
#
# Starting a fresh browser for every URL forces Chrome start-up and the
# cookie-consent dance to be repeated each time; reusing the session pays
# those costs once.

# Update the PATH environment variable
os.environ['PATH'] += r";C:\Users\rafme\Desktop\Selenium Drivers"

CHROMEDRIVER_PATH = r"C:\Users\rafme\Desktop\Selenium Drivers\chromedriver.exe"
FEED_ITEM_XPATH = "//div[@class='FiltersInstantSearch']//div[@class='feed-item']"
BANNER_TIMEOUT = 60  # seconds to wait for the cookie banner (once per session)
FEED_TIMEOUT = 5     # seconds to wait for listings to render; tune as needed
# NOTE(review): a page is kept only when it shows MORE than one listing
# (same as the original `> 1` check). If the goal is strictly "drop pages
# with 0 listings", this should probably be 1 -- confirm before changing.
MIN_LISTINGS = 2


def _build_driver():
    """Create the headless Chrome driver shared by every URL in the run."""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--disable-gpu")  # Disable GPU usage
    chrome_options.add_argument("--no-sandbox")  # Disable sandboxing
    chrome_options.add_argument("--disable-dev-shm-usage")  # Disable shared memory usage
    # Chrome expects "width,height"; the "1920x1080" form is not the documented format.
    chrome_options.add_argument("--window-size=1920,1080")  # Set the window size
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36")
    service = Service(CHROMEDRIVER_PATH)
    return webdriver.Chrome(service=service, options=chrome_options)


def _dismiss_cookie_banner(driver):
    """Try to reject cookies; return True when no further attempt is needed.

    Returns True after the reject button was clicked, or when the banner
    never showed up within BANNER_TIMEOUT. Returns False when the click was
    intercepted, so the caller can retry on the next page. Any other
    exception propagates to the caller.
    """
    try:
        WebDriverWait(driver, BANNER_TIMEOUT).until(
            EC.presence_of_element_located((By.ID, "onetrust-reject-all-handler")))
        reject_button = driver.find_element(By.ID, "onetrust-reject-all-handler")
    except (TimeoutException, NoSuchElementException):
        # No banner appeared; do not pay the long wait again on every page.
        return True
    # Scroll the element into view using JavaScript
    driver.execute_script("arguments[0].scrollIntoView(true);", reject_button)
    time.sleep(2)  # Wait for the scrolling to complete
    try:
        # Same click / pause / click sequence as before. It now runs once
        # per session instead of once per URL, so the sleeps are negligible.
        reject_button.click()
        time.sleep(1)
        reject_button.click()
        time.sleep(1)
    except ElementClickInterceptedException:
        return False
    return True


def _close_login_modal(driver):
    """Click beside the login modal to close it, if it is present."""
    try:
        elem = driver.find_element(By.XPATH, "//div[@class='Modal-Content']")
    except NoSuchElementException:
        return
    ac = ActionChains(driver)
    ac.move_to_element(elem).move_by_offset(250, 0).click().perform()  # clicking away from login window


def _count_listings(driver):
    """Return the number of feed items, waiting briefly for them to render."""
    try:
        # Wait until enough items are present to decide, rather than reading
        # the DOM at an arbitrary moment while the feed may still be loading.
        WebDriverWait(driver, FEED_TIMEOUT).until(
            lambda d: len(d.find_elements(By.XPATH, FEED_ITEM_XPATH)) >= MIN_LISTINGS)
    except TimeoutException:
        pass  # Too few (or no) listings showed up in time; count what is there.
    return len(driver.find_elements(By.XPATH, FEED_ITEM_XPATH))


# Read the CSV file
BrandCategoryLinks = pd.read_csv('C:/Users/rafme/Downloads/Test Brands & Categories.csv')
FilteredCategoryLink = []

driver = _build_driver()
banner_handled = False
try:
    # Loop through each link in the DataFrame
    for base_url in BrandCategoryLinks['Links']:
        try:
            driver.get(base_url)

            # The consent choice is expected to persist for the session, so
            # the banner only needs handling until it has been dealt with.
            if not banner_handled:
                try:
                    banner_handled = _dismiss_cookie_banner(driver)
                except Exception as e:
                    print(f"Error occurred: {e}")
                    continue

            # Close the user login modal if it exists
            try:
                _close_login_modal(driver)
            except Exception as e:
                print(f"Error clicking 'User Authentication' button: {e}")
                continue

            # Check listing count
            try:
                listing_total = _count_listings(driver)
            except Exception as e:
                print(f"Error finding listings: {e}")
                continue

            if listing_total >= MIN_LISTINGS:
                print(f"Found {listing_total} listings on {base_url}")
                FilteredCategoryLink.append(base_url)
            else:
                print(f"Found {listing_total} listings on {base_url}, not enough to keep.")
        except Exception as e:
            # Report the URL itself rather than the whole DataFrame row.
            print(f"Error processing link {base_url}: {e}")
finally:
    driver.quit()
    # Save the filtered categories to CSV. Done in `finally` so that an
    # interrupted run still keeps the links collected so far.
    filtered_categories = pd.DataFrame(FilteredCategoryLink, columns=['Link'])
    filtered_categories.to_csv('filtered_categories.csv', index=False)
As suggested in the comments, it's better to pull the data via API using Python's requests
library.
The website currently has around 12k designers and 128 subcategories, which would result in up to 1.5M data points. Here are the 3 steps to significantly speed it up:
With requests, it takes just ~0.3s per request, which is an additional 43x improvement compared to 13 seconds. Putting these things together resulted in a ~180,000x speed improvement over your original implementation. In other words, it takes slightly over one minute to pull all the data.
And if it's still not enough, you could add proxies as step #4. Hope this provides some useful insights.
import requests
import json
from urllib.parse import quote
# Collect, for every designer returned by the Grailed designers endpoint,
# the per-category listing counts reported by the Algolia search index.
# Result: `data`, a list of one-entry dicts {designer name: {category_path: count}}.

# Headers sent with every request (browser-like User-Agent plus the
# Algolia key / application id headers).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
           'X-Algolia-Api-Key': 'bc9ee1c014521ccf312525a4ef324a16',
           'X-Algolia-Application-Id': 'MNRWEFSS2Q'}
url_designers = 'https://www.grailed.com/api/designers'
url_api = 'https://mnrwefss2q-dsn.algolia.net/1/indexes/*/queries'
request_timeout = 30  # seconds; without a timeout a stalled connection blocks forever

# One Session reuses the underlying connection across the many requests.
session = requests.Session()
session.headers.update(headers)

req_designers = session.get(url_designers, timeout=request_timeout)
req_designers.raise_for_status()  # fail loudly on an HTTP error instead of on a confusing KeyError
designers = json.loads(req_designers.text)['data']

# The requested facet list is the same for every designer, so build it once.
facets = quote(json.dumps(["category_path"]))

data = []
for des in designers:
    designer_name = des['name']
    # json.dumps escapes quotes/backslashes in the name, and pulling the name
    # into a variable avoids nesting the same quote type inside an f-string
    # (a SyntaxError before Python 3.12).
    facetFilters = quote(json.dumps([[f"designers.name:{designer_name}"]], ensure_ascii=False))
    params = f"maxValuesPerFacet=200&hitsPerPage=0&facetFilters={facetFilters}&facets={facets}"
    payload = json.dumps({"requests": [{"indexName": "Listing_by_low_price_production",
                                        "params": params}]})
    req = session.post(url_api, data=payload, timeout=request_timeout)
    req.raise_for_status()
    listings = json.loads(req.text)['results'][0]['facets']
    # A designer without a 'category_path' facet is recorded with an empty dict.
    data.append({designer_name: listings.get('category_path', {})})
data
The output looks like this:
[
...
{'Acne Studios': {'bottoms.denim': 4644, 'tops.sweaters_knitwear': 2266, 'tops.sweatshirts_hoodies': 1658, 'womens_bottoms.jeans': 1122, 'tops.short_sleeve_shirts': 1087, 'bottoms.casual_pants': 1078, 'tops.button_ups': 960, 'outerwear.light_jackets': 591, 'womens_tops.sweaters': 557, 'footwear.lowtop_sneakers': 331, 'tops.long_sleeve_shirts': 295, 'outerwear.heavy_coats': 289, 'bottoms.shorts': 288, 'outerwear.denim_jackets': 279, 'outerwear.bombers': 257, 'womens_bottoms.pants': 211, 'outerwear.leather_jackets': 207, 'accessories.hats': 188, 'womens_tops.sweatshirts': 160, 'womens_tops.short_sleeve_shirts': 159, 'tailoring.blazers': 142, 'womens_footwear.boots': 140, 'outerwear.parkas': 122, 'womens_dresses.midi': 116, 'bottoms.sweatpants_joggers': 108, 'tops.polos': 107, 'accessories.gloves_scarves': 102, 'footwear.hitop_sneakers': 94, 'womens_outerwear.jackets': 91, 'womens_tops.blouses': 90, 'womens_outerwear.coats': 86, 'footwear.boots': 83, 'womens_tops.button_ups': 80, 'bottoms.cropped_pants': 76, 'tops.sleeveless': 74, 'womens_dresses.mini': 65, 'womens_footwear.lowtop_sneakers': 65, 'accessories.bags_luggage': 64, 'womens_tops.long_sleeve_shirts': 64, 'womens_outerwear.denim_jackets': 60, 'accessories.sunglasses': 57, 'womens_outerwear.blazers': 55, 'footwear.leather': 53, 'womens_accessories.scarves': 53, 'womens_outerwear.leather_jackets': 52, 'womens_bottoms.mini_skirts': 47, 'womens_bottoms.midi_skirts': 44, 'tailoring.suits': 41, 'womens_dresses.maxi': 41, 'womens_accessories.hats': 40, 'womens_tops.hoodies': 40, 'womens_tops.tank_tops': 38, 'womens_bottoms.shorts': 37, 'outerwear.vests': 35, 'womens_outerwear.bombers': 31, 'footwear.formal_shoes': 29, 'womens_footwear.heels': 29, 'accessories.jewelry_watches': 25, 'tailoring.formal_trousers': 24, 'womens_tops.crop_tops': 22, 'womens_tops.polos': 22, 'outerwear.raincoats': 19, 'womens_outerwear.down_jackets': 18, 'outerwear.cloaks_capes': 17, 'womens_accessories.miscellaneous': 17, 
'womens_bags_luggage.shoulder_bags': 17, 'accessories.misc': 16, 'accessories.wallets': 16, 'footwear.slip_ons': 15, 'womens_footwear.sandals': 14, 'womens_accessories.sunglasses': 13, 'womens_bags_luggage.tote_bags': 12, 'womens_bottoms.joggers': 12, 'accessories.belts': 11, 'accessories.glasses': 11, 'womens_footwear.flats': 11, 'footwear.sandals': 10, 'tops.jerseys': 10, 'womens_footwear.hitop_sneakers': 10, 'womens_footwear.platforms': 9, 'womens_bottoms.leggings': 8, 'womens_bottoms.maxi_skirts': 8, 'accessories.socks_underwear': 7, 'bottoms.swimwear': 7, 'womens_accessories.belts': 7, 'womens_outerwear.vests': 7, 'bottoms.jumpsuits': 6, 'womens_footwear.slip_ons': 6, 'womens_bags_luggage.crossbody_bags': 5, 'womens_bottoms.sweatpants': 5, 'tailoring.vests': 4, 'womens_accessories.socks_intimates': 4, 'womens_accessories.wallets': 4, 'womens_bags_luggage.handle_bags': 4, 'womens_dresses.gowns': 4, 'accessories.periodicals': 3, 'accessories.ties_pocketsquares': 3, 'bottoms.leggings': 3, 'tailoring.formal_shirting': 3, 'womens_bags_luggage.clutches': 3, 'womens_bags_luggage.mini_bags': 3, 'womens_bags_luggage.other': 3, 'womens_jewelry.necklaces': 3, 'womens_outerwear.fur_faux_fur': 3, 'bottoms': 2, 'womens_bags_luggage.backpacks': 2, 'womens_bags_luggage.bucket_bags': 2, 'womens_bottoms.jumpsuits': 2, 'womens_footwear.mules': 2, 'womens_jewelry.bracelets': 2, 'womens_jewelry.earrings': 2, 'tailoring.tuxedos': 1, 'womens_accessories.glasses': 1, 'womens_accessories.hair_accessories': 1, 'womens_jewelry.body_jewelry': 1, 'womens_jewelry.rings': 1, 'womens_outerwear.rain_jackets': 1, 'womens_tops.bodysuits': 1}},
{'A.Coba.Lt': {'footwear.boots': 1, 'tops.sweatshirts_hoodies': 1}},
{'A Cold Wall': {'tops.short_sleeve_shirts': 293, 'tops.sweatshirts_hoodies': 280, 'footwear.lowtop_sneakers': 187, 'bottoms.sweatpants_joggers': 183, 'outerwear.light_jackets': 148, 'tops.long_sleeve_shirts': 133, 'accessories.bags_luggage': 129, 'bottoms.casual_pants': 108, 'tops.sweaters_knitwear': 71, 'accessories.hats': 61, 'outerwear.vests': 61, 'footwear.boots': 56, 'bottoms.shorts': 54, 'footwear.hitop_sneakers': 52, 'outerwear.heavy_coats': 48, 'tops.button_ups': 47, 'outerwear.raincoats': 30, 'accessories.belts': 24, 'bottoms.denim': 21, 'accessories.misc': 18, 'outerwear.denim_jackets': 18, 'outerwear.parkas': 17, 'outerwear.bombers': 14, 'accessories.gloves_scarves': 13, 'footwear.leather': 11, 'tops.polos': 11, 'footwear.slip_ons': 10, 'womens_bottoms.midi_skirts': 10, 'accessories.jewelry_watches': 8, 'accessories.sunglasses': 7, 'accessories.socks_underwear': 6, 'accessories.wallets': 6, 'tops.sleeveless': 6, 'bottoms.cropped_pants': 5, 'footwear.sandals': 5, 'bottoms.leggings': 4, 'outerwear.cloaks_capes': 4, 'tops.jerseys': 4, 'tailoring.blazers': 3, 'womens_bottoms.jeans': 3, 'womens_footwear.boots': 3, 'womens_footwear.hitop_sneakers': 3, 'accessories.periodicals': 2, 'footwear.formal_shoes': 2, 'womens_bottoms.shorts': 2, 'womens_outerwear.rain_jackets': 2, 'womens_tops.sweaters': 2, 'accessories.glasses': 1, 'bottoms.jumpsuits': 1, 'bottoms.swimwear': 1, 'tailoring.suits': 1, 'womens_bottoms.leggings': 1, 'womens_outerwear.vests': 1, 'womens_tops.button_ups': 1}}
...
]