python, selenium, selenium-webdriver, web-scraping

How to continue scraping data from where the loop broke due to errors, using Python Selenium


I'm using Selenium in Python to scrape data from the ScienceDirect website. I'm able to scrape data, but at some point a new window opens in the driver and the code breaks after extracting data from a few hundred articles. I want to know whether it's possible to resume extracting from the point where the code broke.

# Importing libraries
import requests
import os
import json
import pandas as pd
from bs4 import BeautifulSoup
import time
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
sciencedirect_list=[]

options = webdriver.ChromeOptions() 

options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

driver = webdriver.Chrome(options=options, 
 executable_path=r"C:\Selenium\chromedriver_win32\chromedriver.exe")

links=['https://www.sciencedirect.com/journal/journal-of-banking-and-finance/issues?page=1','https://www.sciencedirect.com/journal/journal-of-banking-and-finance/issues?page=2','https://www.sciencedirect.com/journal/journal-of-banking-and-finance/issues?page=3']

for i in links:
    driver = webdriver.Chrome(options=options, 
    executable_path=r"C:\Selenium\chromedriver_win32\chromedriver.exe")
    driver.get(i)
    sleep(4)
    accordions = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "li.accordion-panel.js-accordion-panel>button.accordion-panel-title>span")))
    for accordion in accordions:
        ActionChains(driver).move_to_element(accordion).click(accordion).perform()

    issues = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "a.anchor.js-issue-item-link.text-m span.anchor-text")))
    window0  = driver.current_window_handle
    for issue in issues:
        ActionChains(driver).key_down(Keys.CONTROL).click(issue).key_up(Keys.CONTROL).perform()
        WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
        windows_after = driver.window_handles
        window1 = [x for x in windows_after if x != window0][0]
        driver.switch_to_window(window1)
        articles = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "a.anchor.article-content-title.u-margin-xs-top.u-margin-s-bottom span.anchor-text")))
        windows2=driver.current_window_handle
        for article in articles:
            ActionChains(driver).key_down(Keys.CONTROL).click(article).key_up(Keys.CONTROL).perform()
            WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(3))
            windows_after1 = driver.window_handles
            window2 = driver.window_handles[2]
            driver.switch_to_window(window2)
            sleep(3)
            sa={}
            try:
                sa["title"]=driver.find_element_by_xpath('//*[@id="screen-reader-main-title"]/span').text
            except:
                print("no title")
            try:
                sa["link"]=driver.find_element_by_xpath('//*[@id="doi-link"]/a[1]').text
            except:
                print("no link")
            try:
                sa["abstract"]=driver.find_element_by_xpath('//*[@id="ab0005"]').text
            except:
                print("no abstract")
            try:
                sa["highlights"]=driver.find_element_by_xpath('//*[@id="ab0010"]').text
            except:
                print("highlights not found")
            try:
                sa["k/c"]=driver.find_element_by_xpath('//*[@id="ks0010"]').text
            except:
                print("no keywords or classifications")
            try:
                sa["c/k"]=driver.find_element_by_xpath('//*[@id="ks0005"]').text
            except:
                print("no keywords or classifications")
            try:
                sa["body"]=driver.find_element_by_xpath('//*[@id="body"]').text
            except:
                print("no body")

        sciencedirect_list.append(sa)
        driver.close()
        driver.switch_to_window(window1)

    driver.close()
    driver.switch_to_window(window0)


driver.close()

It would also be very helpful if someone could suggest a more efficient version of this code.


Solution

  • When working with windows, if you can assume that new windows open from left to right and are closed from right to left, then you can always switch to the most recently opened window with driver.switch_to.window(driver.window_handles[-1]).

    If I were you, I would simply call that every time a click opens a new tab, and again after every driver.close().
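
    For example, a minimal sketch of that pattern applied to the question's issue loop (reusing the question's driver, issue variable, waits, and imports; these names come from the question's code, not from the answer below) could look like this:

    # Ctrl+click opens the issue in a new tab, exactly as in the question's loop.
    ActionChains(driver).key_down(Keys.CONTROL).click(issue).key_up(Keys.CONTROL).perform()
    WebDriverWait(driver, 10).until(EC.number_of_windows_to_be(2))
    driver.switch_to.window(driver.window_handles[-1])  # jump to the newest (right-most) window
    # ... scrape in the new tab ...
    driver.close()
    driver.switch_to.window(driver.window_handles[-1])  # back to the last remaining window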

    I took the liberty of reworking your code a bit, but in the process I removed the flows for opening and closing new tabs; I hope it is still useful to you. It is much faster than the original code, and if you add threading and launch it in headless mode it will work quite well.
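
    For headless runs, the only change needed is the Chrome option; a minimal sketch (the threading part is left out here):

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # run Chrome without opening a visible window
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)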

    The idea is to first collect all the issue links from the issues pages, then collect all the article links, and finally visit each article and extract the data you wanted.

    Also, I used webdriver_manager as I did not want to download a new chromedriver manually.

    import traceback
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from webdriver_manager.chrome import ChromeDriverManager  # pip install webdriver_manager
    
    
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    driver.set_page_load_timeout(20)
    
    links=['https://www.sciencedirect.com/journal/journal-of-banking-and-finance/issues?page=1',
           'https://www.sciencedirect.com/journal/journal-of-banking-and-finance/issues?page=2',
           'https://www.sciencedirect.com/journal/journal-of-banking-and-finance/issues?page=3']
    
    def get_url_and_wait_for_page_load(_driver, url):
        _driver.get(url)
        WebDriverWait(_driver, 30).until(
            EC.visibility_of_all_elements_located((By.XPATH, "//div[@class = 'usabilla_live_button_container']")))
    
    volume_links, issue_parse_failed = [], []  # you can reprocess failed list if you wish
    for link in links:
        try:
            print "Looking for volume links @ {}".format(link)
            get_url_and_wait_for_page_load(driver, link)
            driver.execute_script('var accordions = document.getElementsByClassName("accordion-panel-title"); '
                                  'for(accordion in accordions) if(accordion > 1) accordions[accordion].click();')
            volume_links += [link.get_attribute("href")
                             for link in driver.find_elements_by_xpath("//a[contains(@class, 'js-issue-item-link')]")]
            print "Total volume links: {}".format(len(volume_links))
        except:
            print "Failed to get volume links @ {}".format(link)
            issue_parse_failed.append(link)
    
    article_links, volume_parse_failed = [], []  # you can reprocess failed list if you wish
    for volume_link in volume_links:
        try:
            print "Looking for article links @ {}".format(volume_link)
            get_url_and_wait_for_page_load(driver, volume_link)
            article_links += [link.get_attribute("href")
                              for link in driver.find_elements_by_xpath("//a[contains(@class, 'article-content-title')]")]
            print "Total article links: {}".format(len(article_links))
        except:
            print traceback.format_exc()
            print "Failed to get article links @ {}".format(volume_link)
            volume_parse_failed.append(volume_link)
    
    # Map each field name to the XPath it is extracted from.
    xpaths = {"title": '//*[@id="screen-reader-main-title"]/span',
              "link": '//*[@id="doi-link"]/a[1]',
              "abstract": '//*[@id="ab0005"]',
              "highlights": '//*[@id="ab0010"]',
              "k/c": '//*[@id="ks0010"]',
              "c/k": '//*[@id="ks0005"]',
              "body": '//*[@id="body"]'}
    
    sciencedirect_list, article_parse_failed = [], []
    for article_link in article_links:
        try:
            print("Extracting data for article @ {}".format(article_link))
            get_url_and_wait_for_page_load(driver, article_link)
            sa = {}
            for name, xpath in xpaths.items():
                try:
                    sa[name] = driver.find_element_by_xpath(xpath).text
                except:
                    sa[name] = None
            sciencedirect_list.append(sa)
        except:
            print("Failed to extract article's data @ {}".format(article_link))
            article_parse_failed.append(article_link)
    
    driver.close()
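
    To address the original question about resuming after a crash, a minimal sketch of one possible approach (not part of the answer above) is to checkpoint progress to disk after every article and skip links that are already done when the script restarts. The loop below would replace the article loop above; the file name progress.json and the two helper functions are hypothetical, and the sketch reuses article_links, driver, article_parse_failed, the xpaths mapping, and get_url_and_wait_for_page_load from the answer's code:

    import json
    import os

    PROGRESS_FILE = "progress.json"  # hypothetical checkpoint file

    def load_progress():
        # Return articles scraped on earlier runs, keyed by their link.
        if os.path.exists(PROGRESS_FILE):
            with open(PROGRESS_FILE, "r", encoding="utf-8") as f:
                return json.load(f)
        return {}

    def save_progress(progress):
        with open(PROGRESS_FILE, "w", encoding="utf-8") as f:
            json.dump(progress, f)

    progress = load_progress()
    for article_link in article_links:
        if article_link in progress:
            continue  # already scraped on a previous run, skip it
        try:
            get_url_and_wait_for_page_load(driver, article_link)
            sa = {}
            for name, xpath in xpaths.items():  # same name/XPath mapping as above
                try:
                    sa[name] = driver.find_element_by_xpath(xpath).text
                except:
                    sa[name] = None
            progress[article_link] = sa
            save_progress(progress)  # checkpoint after every article
        except:
            article_parse_failed.append(article_link)

    sciencedirect_list = list(progress.values())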