Tags: python, web-scraping, python-beautifultable

Slow web scraping using BeautifulSoup in Python


I want to scrape the title and date of news articles from the OilPrice website using this code:

"""Scrape article titles and dates from OilPrice crude-oil listing pages 104-109
and save the oil-related ones to a CSV file."""
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import requests
import os
import pandas as pd  # was missing: pd.DataFrame below raised NameError without it

title_list = []  # titles of articles whose title mentions "oil"
date_list = []   # matching publication dates (same index as title_list)

os.chdir(r'C:\Users\Foued Azuz 14\mypythonfiles')
driver = webdriver.Chrome()
for page in range(104, 110):
    url = f"https://oilprice.com/Energy/Crude-Oil/Page-{page}.html"
    driver.get(url)
    # Parse the rendered page once per navigation (the duplicate bs4 import
    # that used to sit here was redundant; it is imported once at the top).
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for article in soup.find_all('div', class_='categoryArticle__content'):
        heading = article.find('h2', class_='categoryArticle__title')
        meta = article.find('p', class_='categoryArticle__meta')
        if heading is None or meta is None:
            # Skip malformed article cards instead of crashing on `.text`.
            continue
        title_text = heading.text
        # Meta looks like "May 05, 2023 at 10:00" -> keep only the date part.
        date = meta.get_text().split(' at ')[0]
        if 'oil' in title_text.lower():
            title_list.append(title_text)
            date_list.append(date)
driver.quit()  # release the browser once scraping is done

data = {'date': date_list, 'Title': title_list}
scrapdataoil = pd.DataFrame(data)
scrapdataoil.to_csv('scrapdataoil104110.csv', index=False)
 

The code works fine; however, my issue is that it takes a long time to loop from one page to another. Is there any way to speed up the execution since I need to loop through many pages?

I want to extract the date and title from news articles, but the algorithm is time-consuming.


Solution

  • You don't need BeautifulSoup if you're using selenium

    Here's a robust solution for acquiring the data you're interested in. Uses multithreading for improved performance

    from selenium import webdriver
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    import pandas as pd
    import time
    from selenium.webdriver import ChromeOptions
    from concurrent.futures import ThreadPoolExecutor
    
    # Max seconds WebDriverWait polls for the article list before raising TimeoutException.
    DWAIT = 5
    # Destination CSV for the scraped (date, title) rows.
    OUTPUT_FILE = "foo.csv"
    # Listing-page URL template; .format() is called with the page number.
    URL = "https://oilprice.com/Energy/Crude-Oil/Page-{}.html"
    
    def get_text(element):
        """Return the stripped textContent of *element*, or "" when the
        element is missing/falsy or has no text content."""
        if not element:
            return ""
        content = element.get_attribute("textContent")
        return content.strip() if content else ""
    
    def get_element(driver, selector):
        """Return the first element matching the CSS *selector*, or None when
        there is no match.

        The original used find_element, which raises NoSuchElementException on
        a missing element — that aborted the whole page on one malformed card
        and made get_text's None-guard unreachable. find_elements returns an
        empty list instead, so callers get None and get_text yields "".
        """
        matches = driver.find_elements(By.CSS_SELECTOR, selector)
        return matches[0] if matches else None
    
    def main(page):
        """Scrape one listing page; return (titles, dates) for articles whose
        title mentions "oil". Dates keep only the first three words of the
        meta line (e.g. "May 05, 2023")."""
        titles = []
        dates = []
        with webdriver.Chrome(options=options) as driver:
            driver.get(URL.format(page))
            locator = (By.CSS_SELECTOR, "div.categoryArticle")
            articles = WebDriverWait(driver, DWAIT).until(
                EC.presence_of_all_elements_located(locator)
            )
            for article in articles:
                heading = get_text(get_element(article, "h2.categoryArticle__title"))
                if not heading or "oil" not in heading.lower():
                    continue
                meta = get_text(get_element(article, "p.categoryArticle__meta"))
                if meta:
                    titles.append(heading)
                    dates.append(" ".join(meta.split()[:3]))
        return titles, dates
                
    if __name__ == "__main__":
        start = time.time()
        # Headless Chrome: `options` is read by main() in each worker thread.
        options = ChromeOptions()
        options.add_argument("--headless")
        all_titles = []
        all_dates = []
        # One browser per page, pages fetched concurrently.
        with ThreadPoolExecutor() as exe:
            for page_titles, page_dates in exe.map(main, range(104, 110)):
                all_titles.extend(page_titles)
                all_dates.extend(page_dates)
        pd.DataFrame({"Date": all_dates, "Title": all_titles}).to_csv(OUTPUT_FILE, index=False)
        end = time.time()
        print(f"Duration={end-start:.2f}s")
    

    Output:

    Duration=3.70s
    

    Update:

    The website being scraped does not appear to depend on JavaScript to the extent that selenium might be required.

    Using multithreading and a combination of requests and BeautifulSoup we can improve performance significantly as follows:

    import requests
    from bs4 import BeautifulSoup as BS
    from concurrent.futures import ThreadPoolExecutor
    import time
    import pandas as pd
    from functools import partial
    
    # Destination CSV for the scraped (date, title) rows.
    OUTPUT_FILE = "foo.csv"
    # Listing-page URL template; .format() is called with the page number.
    URL = "https://oilprice.com/Energy/Crude-Oil/Page-{}.html"
    
    def main(session, page):
        """Fetch one listing page with *session* and return (titles, dates)
        for articles whose title mentions "oil". Raises for HTTP errors."""
        titles = []
        dates = []
        with session.get(URL.format(page)) as response:
            response.raise_for_status()
            soup = BS(response.text, "lxml")
            for article in soup.select("div.categoryArticle"):
                heading = article.select_one("h2.categoryArticle__title")
                # bs4 Tag truthiness is len-based, so an empty <h2> is skipped
                # exactly as in the original walrus-based checks.
                if not heading:
                    continue
                text = heading.text
                if not text or "oil" not in text.lower():
                    continue
                meta = article.select_one("p.categoryArticle__meta")
                if not meta:
                    continue
                titles.append(text.strip())
                # Keep only the first three words, e.g. "May 05, 2023".
                dates.append(" ".join(meta.text.strip().split()[:3]))
        return titles, dates
    
    if __name__ == "__main__":
        start = time.time()
        all_titles = []
        all_dates = []
        # One shared Session (connection pooling) across all worker threads.
        with requests.Session() as session:
            scrape = partial(main, session)
            with ThreadPoolExecutor() as exe:
                for page_titles, page_dates in exe.map(scrape, range(104, 110)):
                    all_titles.extend(page_titles)
                    all_dates.extend(page_dates)
            pd.DataFrame({"Date": all_dates, "Title": all_titles}).to_csv(OUTPUT_FILE, index=False)
            end = time.time()
            print(f"Duration={end-start:.2f}s")
    

    Output:

    Duration=0.90s