python web-scraping scrapy

How do you scrape/download all of the product images from eBay using Python?


I'm only able to scrape the URL of one full-resolution image from the eBay site; I'm unable to capture the URLs of all other images. I'm looking for a script that scrapes or downloads all of the images.

I want to download the high-resolution photographs, not the thumbnails.

from lxml import html  
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Script: collect product links from an eBay search page, then read the title
# and the main image (<img id="icImg">) of each product page.

# Site root (defined for reference; nothing below reads it).
main_url = 'https://www.ebay.com/'
# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
# eBay search URL (the query string carries _nkw=laptop).
url = 'https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2334524.m570.l1313&_nkw=laptop&_sacat=0&LH_TitleDesc=0&rt=nc&_odkw=toaster&_osacat=0&LH_PrefLoc=3&LH_All=1&_ipg=240'
r = requests.get(url, headers=headers)
print(r)
soup = BeautifulSoup(r.content, 'html.parser')

# Product links are the anchors inside each div.s-item__image.
product_list = soup.find_all('div', class_='s-item__image')
all_links = [
    link['href']
    for item in product_list
    for link in item.find_all('a', href=True)
]

# De-duplicate while keeping page order, keep only item pages, and limit the
# run to the first two products.
products_site = [
    x for x in dict.fromkeys(all_links)
    if x.startswith('https://www.ebay.com/itm/')
][:2]
# Count the collected links. Passing the list matters: len('product_site')
# measures the string literal and is always 12.
print(len(products_site))

item_list = []
for link in products_site:
    r = requests.get(link, headers=headers)
    print(r)
    if not r.ok:
        # Skip pages that did not load instead of parsing an error page.
        continue
    soup = BeautifulSoup(r.content, 'html.parser')

    # select_one() takes a CSS selector, so the class belongs in the selector
    # string; fall back to the first <h1> if the class is not present.
    title_tag = soup.select_one('h1.x-item-title__mainTitle') or soup.select_one('h1')
    # Titles may contain U+FEFF characters, which strip=True does not remove.
    title = title_tag.get_text(strip=True).replace('\ufeff', '') if title_tag else ''

    # Only the main image is looked up here; <img> tags without src are skipped.
    image_url = [
        img['src']
        for img in soup.find_all('img', {'id': 'icImg'})
        if img.get('src')
    ]

    product = {
        "Title": title,
        "Image_URL": image_url,
    }
    # Keep every scraped product so the results are available after the loop.
    item_list.append(product)

Solution

  • On eBay, every size of a given image shares the same base URL; only the size suffix at the end of the file name changes.

    To get all the images of a product in high resolution, take the URLs of its thumbnails and change the size suffix in each one to a larger size.

    for example -

    https://i.ebayimg.com/images/g/pxcAAOSwis1hwW4V/s-l64.jpg
    

    The trailing s-l64 before .jpg denotes the image size (64 pixels here). You can change it to s-l100, s-l300 or s-l500 to increase the resolution; the highest resolution supported is s-l2000.

    So you can just replace the thumbnail's s-l64 with s-l2000 to get HQ images.

    Using this trick you don't need to click on the images to zoom in and get HQ images.

    Full working code -

    import requests
    from bs4 import BeautifulSoup
    
    # Script: collect product links from an eBay search page, then print the
    # title and the full-resolution image URLs of each product page.

    # Site root (defined for reference; nothing below reads it).
    main_url = 'https://www.ebay.com/'
    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }
    # eBay search URL (the query string carries _nkw=laptop).
    url = 'https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2334524.m570.l1313&_nkw=laptop&_sacat=0&LH_TitleDesc=0&rt=nc&_odkw=toaster&_osacat=0&LH_PrefLoc=3&LH_All=1&_ipg=240'
    r = requests.get(url, headers=headers)
    print(r)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Product links are the anchors inside each div.s-item__image.
    product_list = soup.find_all('div', class_='s-item__image')
    all_links = [
        link['href']
        for item in product_list
        for link in item.find_all('a', href=True)
    ]

    # De-duplicate while keeping page order, keep only item pages, and limit
    # the run to the first two products.
    products_site = [
        x for x in dict.fromkeys(all_links)
        if x.startswith('https://www.ebay.com/itm/')
    ][:2]
    # Count the collected links. Passing the list matters: len('product_site')
    # measures the string literal and is always 12.
    print(len(products_site))

    item_list = []
    for link in products_site:
        r = requests.get(link, headers=headers)
        print(r)
        if not r.ok:
            # Skip pages that did not load instead of parsing an error page.
            continue
        soup = BeautifulSoup(r.content, 'html.parser')

        # select_one() takes a CSS selector, so the class belongs in the
        # selector string; fall back to the first <h1> if the class is absent.
        title_tag = soup.select_one('h1.x-item-title__mainTitle') or soup.select_one('h1')
        # Titles may contain U+FEFF characters, which strip=True does not remove.
        title = title_tag.get_text(strip=True).replace('\ufeff', '') if title_tag else ''

        # example page - https://www.ebay.com/itm/125058259597?epid=4051542538&hash=item1d1e0d9a8d:g:pxcAAOSwis1hwW4V

        # Gallery thumbnails: swap the 64px size suffix for the largest one.
        # Tags without src and non-product icons (e.g. the "image not
        # available" placeholder on p.ebaystatic.com) are left out.
        image_urls = []
        for img in soup.select('ul#vertical-align-items-viewport > li img'):
            src = img.get('src')
            if src and 'ebayimg.com' in src:
                image_urls.append(src.replace('s-l64', 's-l2000'))

        if not image_urls:
            # example page with no extra images
            # https://www.ebay.com/itm/125287169558?epid=19053326726&hash=item1d2bb27e16:g:sRAAAOSwKV9ia3Ia

            # No gallery: fall back to the main image, URL unchanged.
            image_urls = [
                img['src']
                for img in soup.find_all('img', {'id': 'icImg'})
                if img.get('src')
            ]

        # Remove duplicate images while keeping order; always a list, so both
        # paths produce the same type.
        image_urls = list(dict.fromkeys(image_urls))

        product = {
            "Title": title,
            "Image_URL": image_urls,
        }
        # Keep every scraped product so the results are available after the loop.
        item_list.append(product)
        print(product)
    
    
    

    Output -

    <Response [200]>
    12
    <Response [200]>
    {'Title': 'Lenovo Legion 5 Pro 16 165Hz QHD IPS G-Sync Ryzen 7 16GB RAM 1TB SSD RTX 3070', 'Image_URL': ['https://i.ebayimg.com/images/g/pxcAAOSwis1hwW4V/s-l2000.jpg', 'https://i.ebayimg.com/images/g/UWEAAOSwLslhwW4V/s-l2000.jpg', 'https://i.ebayimg.com/images/g/sOIAAOSwANNhwW4V/s-l2000.jpg', 'https://i.ebayimg.com/images/g/SOIAAOSwwORhwW4V/s-l2000.jpg', 'https://i.ebayimg.com/images/g/g7kAAOSwhzNhwW4V/s-l2000.jpg', 'https://i.ebayimg.com/images/g/HjsAAOSw6pxhvXmX/s-l2000.jpg', 'https://i.ebayimg.com/images/g/OSQAAOSwAvVhwW4V/s-l2000.jpg', 'https://i.ebayimg.com/images/g/pHAAAOSwjnJhwW4V/s-l2000.jpg', '//p.ebaystatic.com/aw/pics/cmp/icn/iconImgNA_96x96.gif', '//p.ebaystatic.com/aw/pics/cmp/icn/iconImgNA_96x96.gif']}
    <Response [200]>
    {'Title': '\ufeff\ufeffLenovo IdeaPad Gaming 3 15.6" 120Hz i5-11300H 8GB RAM 512GB SSD GTX 1650', 'Image_URL': {'https://i.ebayimg.com/images/g/sRAAAOSwKV9ia3Ia/s-l500.jpg'}}