python, selenium, web-scraping, retry-logic, retrywhen

Python Selenium web scraping: how do I keep retrying when the network is down or the scraped website is slow?


I am trying to scrape a table from a website using Selenium in Python, but the site is very slow and there are frequent network issues. I would like the code to keep retrying even if the website takes time to load. I have 941 entries to scrape. I tried a module named retry that I found online, but it does not seem to work. A sample of my code is below, followed by a sketch of how I understand the decorator is supposed to be applied. Is there any other way to make the code keep retrying until the website loads?

import pandas as pd
import io
import time

from bs4 import BeautifulSoup
from retry import retry
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from webdriver_manager.firefox import GeckoDriverManager

driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())

# Web page url 
driver.get("http://mnregaweb4.nic.in/netnrega/dynamic_work_details.aspx?page=S&lflag=eng&state_name=KERALA&state_code=16&fin_year=2020-2021&source=national&Digest=s5wXOIOkT98cNVkcwF6NQA") 
@retry()
def make_trouble():
    '''Retry until succeed'''
driver.implicitly_wait(5)  
# Find the District dropdown
x = driver.find_element_by_id('ContentPlaceHolder1_ddl_dist') 
drop = Select(x) 

# Select by value 
drop.select_by_value("1613")
@retry()
def make_trouble():
    '''Retry until succeed'''
time.sleep(6) 

# Find the Block dropdown
x = driver.find_element_by_id('ContentPlaceHolder1_ddl_blk') 
drop = Select(x) 
  
# Select by value 
drop.select_by_value("1613001")
@retry()
def make_trouble():
    '''Retry until succeed'''
time.sleep(4) 

# Find the GP dropdown
x = driver.find_element_by_id('ContentPlaceHolder1_ddl_pan') 
drop = Select(x) 
  
# Select by value 
drop.select_by_value("1613001001")
@retry()
def make_trouble():
    '''Retry until succeed'''
time.sleep(4) 


search_button = driver.find_element_by_id("ContentPlaceHolder1_Button1")
search_button.click()
time.sleep(8)

soup = BeautifulSoup(driver.page_source, 'lxml')
tables = soup.find_all('table')
dfs = pd.read_html(str(tables))

print(dfs[4])

df1 = pd.read_csv(io.StringIO(dfs[4].to_csv(index=False)), skiprows=1, header=[0,1])
df1.to_csv("test with pandas V3.csv", index=False)
driver.close()
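For what it's worth, my understanding is that the retry decorator only retries the function it wraps, and only when that function is actually called and raises an exception, so decorating empty placeholder functions as above has no effect. A minimal sketch of how I believe the decorator is supposed to be applied (tries and delay are parameters of the retry package's decorator; the element ID and value are taken from my code above):

from retry import retry
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.support.ui import Select

@retry(WebDriverException, tries=10, delay=5)
def select_district(driver):
    """Retried automatically whenever a Selenium error is raised."""
    drop = Select(driver.find_element_by_id('ContentPlaceHolder1_ddl_dist'))
    drop.select_by_value("1613")

select_district(driver)  # the decorated function must actually be called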

Solution

  • This is not my code; as requested by ABC Admin, this is a modification of the code by Sangun Devkota.

    This way it prints an error message once every 5 failed attempts, and the loop stops once the scrape succeeds.

    x = 0
    while True:
        try:
            driver.implicitly_wait(5)  
            # Find the District dropdown
            elem = driver.find_element_by_id('ContentPlaceHolder1_ddl_dist')
            drop = Select(elem)
    
            # Select by value 
            drop.select_by_value("1613")
    
            time.sleep(6) 
    
            # Find the Block dropdown
            elem = driver.find_element_by_id('ContentPlaceHolder1_ddl_blk')
            drop = Select(elem)
    
            # Select by value 
            drop.select_by_value("1613001")
    
            time.sleep(4) 
    
            # Find the GP dropdown
            elem = driver.find_element_by_id('ContentPlaceHolder1_ddl_pan')
            drop = Select(elem)
    
            # Select by value 
            drop.select_by_value("1613001001")
    
            time.sleep(4) 
    
    
            search_button = driver.find_element_by_id("ContentPlaceHolder1_Button1")
            search_button.click()
            time.sleep(8)
    
            soup = BeautifulSoup(driver.page_source, 'lxml')
            tables = soup.find_all('table')
            dfs = pd.read_html(str(tables))
    
            print(dfs[4])
    
            df1 = pd.read_csv(io.StringIO(dfs[4].to_csv(index=False)), skiprows=1, header=[0,1])
            df1.to_csv("test with pandas V3.csv", index=False)
            driver.close()
            break  # success, so leave the retry loop
        except Exception:
            if x % 5 == 0:
                print("Error")
            x += 1
    

    If you want it to print the error only once, you can change it to this:

    x = True

    ... Other code ...

        except Exception:
            if x:
                print('Error')
                x = False
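
    As a side note (this is an addition to the answer above, sketched under the assumption that the page is merely slow rather than permanently unreachable): Selenium's explicit waits can replace the fixed time.sleep calls, so each step waits only as long as it needs to, up to a timeout. WebDriverWait and expected_conditions are part of the standard Selenium API; the element ID and value are the ones from the question.

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import Select, WebDriverWait

    wait = WebDriverWait(driver, 30)  # poll for up to 30 seconds per step

    # Block until the district dropdown is actually in the DOM, then select it
    district = wait.until(
        EC.presence_of_element_located((By.ID, 'ContentPlaceHolder1_ddl_dist')))
    Select(district).select_by_value("1613")

    WebDriverWait polls the condition until it is met or the timeout expires, which usually handles slow pages more gracefully than fixed sleeps wrapped in blanket retries.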