I'm trying to scrape Google Shopping and get each product's price, image, and URL, but I run into a problem when I try find('img').get('src'). It returns:
data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==
My code so far:
import requests
from bs4 import BeautifulSoup as bs
import json
from rich import print


def gg_shopping(query):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'}
    s = requests.Session()
    s.headers.update(headers)
    big_list = []
    search = query.replace(' ', '+')
    r = s.get(f'https://www.google.com/search?q={search}&tbm=shop')
    soup = bs(r.text, 'lxml')
    all_prods = soup.find_all('div', 'sh-dgr__gr-auto')
    for prod in all_prods:
        link = "https://www.google.com" + str(prod.find('a').get('href'))
        img = prod.find('img').get('src')
        print(img)


def main():
    gg_shopping("may tinh")


if __name__ == "__main__":
    main()
I am not sure how the BeautifulSoup code is related to Selenium. It seems that Google detects you are scraping, so it serves a placeholder structure to BeautifulSoup: the base64 src you are getting decodes to a tiny transparent GIF, not a real image URL.
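If you want to verify that yourself, here is a minimal diagnostic sketch (it reuses the selector and User-Agent from your code, with a hard-coded query just for illustration) that only reports whether each src in the static HTML is a data: URI placeholder:

import requests
from bs4 import BeautifulSoup as bs

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'}
r = requests.get('https://www.google.com/search?q=cat&tbm=shop', headers=headers)
soup = bs(r.text, 'lxml')

for img in soup.select('div.sh-dgr__gr-auto img[src]'):
    src = img['src']
    # data: URIs are inline placeholders, not real image URLs
    print('placeholder' if src.startswith('data:') else src)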
To get the images with Selenium, you just need to wait for the presence of images matching the locator div.sh-dgr__gr-auto img[src] and read their src property.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)


def gg_shopping(query):
    search = query.replace(' ', '+')
    url = f'https://www.google.com/search?q={search}&tbm=shop'
    driver.get(url)
    # wait until the product images are present in the rendered page, then read their src
    images = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.sh-dgr__gr-auto img[src]')))
    for image in images:
        print(image.get_property('src'))


gg_shopping('cat')
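If you do not want a browser window to pop up, a headless setup is a small change (a sketch assuming Selenium 4's Chrome Options API; older Chrome versions take --headless instead of --headless=new). It replaces the driver = webdriver.Chrome() line above, and it is good practice to quit the driver when you are done:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless=new')  # run Chrome without a visible window
driver = webdriver.Chrome(options=options)

# ... run the same gg_shopping code as above with this driver ...

driver.quit()  # close the browser and free its resources when finished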