pythonseleniumweb-scrapingbeautifulsoupchrome-web-driver

Web Scraping TimeOutException: Message:


I am trying to scrape an ecommerce website. I would like to scrape the product description of every product from the search results. I successfully scrape all the product links from the search results and get the product description of one product. However, when I try to loop the product links to get the product description from all of the products that I get from the search results, the TimeOutException: Message is coming up.

I already try to change the time of the WebDriverWait and it doesn't fix the error.

Any idea what should I do?

Here is my code:

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options       # to customize chrome display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC 
from time import sleep
from collections import Counter
import json
from turtle import delay
import time

# create object for chrome options
chrome_options = Options()

# Customize chrome display
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1365,4597.190")
chrome_options.add_argument('--disable-infobars')      

# create webdriver object
path = '/Applications/chromedriver'
webdriver_service = Service(path)
driver = webdriver.Chrome(executable_path=path, options=chrome_options)

baseurl = 'https://shopee.co.id/search?keyword=obat%20kanker'

product_links = []

for page in range(0,6):
    search_link = 'https://shopee.co.id/search?keyword=obat%20kanker&page={}'.format(page)
    driver.get(search_link)
    WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item")))

    driver.execute_script("""
            var scroll = document.body.scrollHeight / 10;
            var i = 0;
            function scrollit(i) {
            window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
            i++;
            if (i < 10) {
                setTimeout(scrollit, 500, i);
                }
            }
            scrollit(i);
            """)
    sleep(5)
    html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    soup = BeautifulSoup(html, "html.parser")

    product_list = soup.find_all('div',class_='col-xs-2-4 shopee-search-item-result__item' )
    for item in product_list:
        for link in item.find_all('a', href=True):
            product_links.append(baseurl + link['href'])



for link in product_links:
        driver.get(link)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "_2VZg1J")))

        driver.execute_script("""
                var scroll = document.body.scrollHeight / 10;
                var i = 0;
                function scrollit(i) {
                window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
                i++;
                if (i < 10) {
                setTimeout(scrollit, 500, i);
                }
                }
                scrollit(i);
                """)

        sleep(20)
        html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
        soup = BeautifulSoup(html, "html.parser")

        name = soup.find('div', class_='_2rQP1z').text.replace('Star+','')
        price = soup.find('div', class_='_2Shl1j').text.replace('Rp','')
        sold = soup.find('div', class_ = 'HmRxgn').text.strip()
        rate = soup.find('div', class_ = '_3y5XOB _14izon').text.strip()
        city = soup.find('span', class_ = '_2fJrvA').text.strip()
        specification = soup.find('div', class_ = '_2jz573').text.strip()

        herbcancer = {
                'name': name,
                'price': price,
                'sold': sold,
                'rate': rate,
                'city': city,
                'specification': specification
        }

        print(herbcancer)

Solution

  • Base url is incorrect that why they show you TimeOutException:

    https://shopee.co.id/search?keyword=obat%20kanker
    

    The correct base url is:

    https://shopee.co.id
    

    Complete Code is :

    import requests
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.chrome.options import Options       # to customize chrome display
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC 
    from time import sleep
    from collections import Counter
    import json
    from turtle import delay
    import time
    
    # create object for chrome options
    chrome_options = Options()
    
    # Customize chrome display
    chrome_options.add_argument('start-maximized')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('disable-notifications')
    chrome_options.add_argument("window-size=1365,4597.190")
    chrome_options.add_argument('--disable-infobars')      
    
    # create webdriver object
    path = ''
    webdriver_service = Service(path)
    driver = webdriver.Chrome(executable_path=path, options=chrome_options)
    
    baseurl = 'https://shopee.co.id'
    
    product_links = []
    
    for page in range(0,6):
        search_link = 'https://shopee.co.id/search?keyword=obat%20kanker&page={}'.format(page)
        driver.get(search_link)
        WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item")))
    
        driver.execute_script("""
                var scroll = document.body.scrollHeight / 10;
                var i = 0;
                function scrollit(i) {
                window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
                i++;
                if (i < 10) {
                    setTimeout(scrollit, 500, i);
                    }
                }
                scrollit(i);
                """)
        sleep(5)
        html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
        soup = BeautifulSoup(html, "html.parser")
    
        product_list = soup.find_all('div',class_='col-xs-2-4 shopee-search-item-result__item' )
        for item in product_list:
            for link in item.find_all('a', href=True):
                comp=baseurl + link['href']
                product_links.append(comp)
                
                
        for link in product_links:
            driver.get(link)
            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "_2VZg1J")))
    
            driver.execute_script("""
                    var scroll = document.body.scrollHeight / 10;
                    var i = 0;
                    function scrollit(i) {
                    window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
                    i++;
                    if (i < 10) {
                    setTimeout(scrollit, 500, i);
                    }
                    }
                    scrollit(i);
                    """)
    
            sleep(3)
            html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
            soup = BeautifulSoup(html, "html.parser")
    
            name = soup.find('div', class_='_2rQP1z').text.replace('Star+','')
            price = soup.find('div', class_='_2Shl1j').text.replace('Rp','')
            sold = soup.find('div', class_ = 'HmRxgn').text.strip()
            rate = soup.find('div', class_ = '_3y5XOB _14izon').text.strip()
            
            try:
                city = soup.find('span', class_ = '_2fJrvA').text.strip()
            except:
                city=''
                
            try:
                specification = soup.find('div', class_ = '_2jz573').text.strip()
            except:
                specification=''
    
            herbcancer = {
                    'name': name,
                    'price': price,
                    'sold': sold,
                    'rate': rate,
                    'city': city,
                    'specification': specification
            }
    
            print(herbcancer)