web-scraping beautifulsoup google-shopping

BeautifulSoup how to get image src in google shopping


I'm trying to scrape Google Shopping to get every product's price, image, and URL, but I ran into a problem: when I call find('img').get('src'), it returns:


w==

My code so far:

import requests
from bs4 import BeautifulSoup as bs
import json
from rich import print


def gg_shopping(query):
  """Print the ``src`` of the first image in each Google Shopping product card.

  Args:
    query: Free-text search phrase. It is passed to ``requests`` as a query
      parameter, so reserved characters are escaped automatically.

  Returns:
    None. Each image ``src`` value is printed as it is found.
  """
  headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'}
  # The context manager closes the session's pooled connections on exit.
  with requests.Session() as s:
    s.headers.update(headers)
    # Let requests build the query string: characters such as '&', '#' or '+'
    # in the query would otherwise corrupt the URL. Spaces are still sent
    # as '+', as before.
    r = s.get(
      'https://www.google.com/search',
      params={'q': query, 'tbm': 'shop'},
      timeout=10,  # do not hang forever on a stalled connection
    )
  soup = bs(r.text, 'lxml')
  all_prods = soup.find_all('div', 'sh-dgr__gr-auto')
  for prod in all_prods:
    img_tag = prod.find('img')
    if img_tag is None:
      # A card without an <img> would otherwise raise AttributeError.
      continue
    print(img_tag.get('src'))

def main():
  """Run a sample Google Shopping search for the phrase "may tinh"."""
  sample_query = "may tinh"
  gg_shopping(sample_query)

# Only run the sample search when executed as a script, not on import.
if __name__ == "__main__":
  main()

Solution

  • I am not sure how BeautifulSoup code is related to Selenium.

    It seems that Google detects that you are using it for scraping purposes, so it serves a fake structure to BeautifulSoup.

    To get the images using Selenium, you just need to wait for the presence of images matching the locator div.sh-dgr__gr-auto img[src] and then read their src property.

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    
    # One shared browser session and an explicit wait with a 10-second limit.
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)

    def gg_shopping(query):
      """Print the ``src`` of every product image on a Google Shopping page.

      Args:
        query: Free-text search phrase; it is URL-encoded before use.

      Raises:
        selenium.common.exceptions.TimeoutException: if no matching image
          is present before the wait expires.
      """
      # Function-scope import keeps the snippet self-contained.
      from urllib.parse import quote_plus

      # quote_plus escapes '&', '#', '+' etc. and still maps spaces to '+',
      # so plain queries produce the same URL as before.
      url = f'https://www.google.com/search?q={quote_plus(query)}&tbm=shop'
      driver.get(url)
      # Wait until <img> elements that already carry a src attribute exist
      # inside the product cards, then read the live DOM property.
      images = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.sh-dgr__gr-auto img[src]')))
      for image in images:
        print(image.get_property('src'))

    try:
      gg_shopping('cat')
    finally:
      # Always shut the browser down, even if the wait times out.
      driver.quit()