I want to scrape the title and date of news articles from the OilPrice website using this code:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import os

title_list = []
date_list = []

os.chdir(r'C:\Users\Foued Azuz 14\mypythonfiles')
driver = webdriver.Chrome()

for page in range(104, 110):
    url = f"https://oilprice.com/Energy/Crude-Oil/Page-{page}.html"
    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html, 'lxml')
    articles = soup.find_all('div', class_='categoryArticle__content')
    for i in articles:
        d = i.find('h2', class_='categoryArticle__title').text
        h = i.find('p', class_='categoryArticle__meta')
        date_str = h.get_text()
        date = date_str.split(' at ')[0]  # keep the date, drop the time
        if 'oil' in d.lower():
            title_list.append(d)
            date_list.append(date)

data = {'date': date_list, 'Title': title_list}
scrapdataoil = pd.DataFrame(data)
scrapdataoil.to_csv('scrapdataoil104110.csv', index=False)
The code works fine; however, my issue is that it takes a long time to loop from one page to another. Is there any way to speed up the execution since I need to loop through many pages?
You don't need BeautifulSoup if you're using Selenium.
Here's a robust solution for acquiring the data you're interested in. It uses multithreading (one browser instance per page) for improved performance:
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import time

DWAIT = 5  # explicit wait in seconds
OUTPUT_FILE = "foo.csv"
URL = "https://oilprice.com/Energy/Crude-Oil/Page-{}.html"

def get_text(element):
    if element:
        if t := element.get_attribute("textContent"):
            return t.strip()
    return ""

def get_element(driver, selector):
    # find_element raises if nothing matches; return None so get_text can cope
    try:
        return driver.find_element(By.CSS_SELECTOR, selector)
    except NoSuchElementException:
        return None

def main(page):
    _date = []
    _title = []
    with webdriver.Chrome(options=options) as driver:
        driver.get(URL.format(page))
        articles = WebDriverWait(driver, DWAIT).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.categoryArticle"))
        )
        for article in articles:
            title = get_text(get_element(article, "h2.categoryArticle__title"))
            if title and "oil" in title.lower():
                meta = get_text(get_element(article, "p.categoryArticle__meta"))
                if meta:
                    _title.append(title)
                    # first three whitespace-separated tokens are the date,
                    # e.g. "Jun 01, 2023"
                    _date.append(" ".join(meta.split()[:3]))
    return _title, _date

if __name__ == "__main__":
    start = time.time()
    data = {
        "Date": [],
        "Title": []
    }
    options = ChromeOptions()
    options.add_argument("--headless")
    with ThreadPoolExecutor() as exe:
        for _title, _date in exe.map(main, range(104, 110)):
            data["Title"].extend(_title)
            data["Date"].extend(_date)
    pd.DataFrame(data).to_csv(OUTPUT_FILE, index=False)
    end = time.time()
    print(f"Duration={end-start:.2f}s")
Output:
Duration=3.70s
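Note that each worker thread launches its own Chrome instance, and ThreadPoolExecutor() defaults to min(32, os.cpu_count() + 4) workers, so extending the range to many pages can exhaust memory. Capping the pool is a one-line change; the worker count below is an illustrative value, not a tuned one:

# cap the number of concurrent Chrome instances; 4 is an example value
with ThreadPoolExecutor(max_workers=4) as exe:
    for _title, _date in exe.map(main, range(104, 110)):
        data["Title"].extend(_title)
        data["Date"].extend(_date)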
Update:
The website being scraped does not appear to depend on JavaScript to the extent that Selenium is required. Using multithreading and a combination of requests and BeautifulSoup, we can improve performance significantly as follows:
import requests
from bs4 import BeautifulSoup as BS
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import time
import pandas as pd

OUTPUT_FILE = "foo.csv"
URL = "https://oilprice.com/Energy/Crude-Oil/Page-{}.html"

def main(session, page):
    titles = []
    dates = []
    with session.get(URL.format(page)) as response:
        response.raise_for_status()
        soup = BS(response.text, "lxml")
        for article in soup.select("div.categoryArticle"):
            if title := article.select_one("h2.categoryArticle__title"):
                if (title := title.text) and "oil" in title.lower():
                    if meta := article.select_one("p.categoryArticle__meta"):
                        titles.append(title.strip())
                        # first three tokens of the meta line are the date
                        dates.append(" ".join(meta.text.strip().split()[:3]))
    return titles, dates

if __name__ == "__main__":
    start = time.time()
    data = {
        "Date": [],
        "Title": []
    }
    with requests.Session() as session:
        with ThreadPoolExecutor() as exe:
            for _title, _date in exe.map(partial(main, session), range(104, 110)):
                data["Title"].extend(_title)
                data["Date"].extend(_date)
    pd.DataFrame(data).to_csv(OUTPUT_FILE, index=False)
    end = time.time()
    print(f"Duration={end-start:.2f}s")
Output:
Duration=0.90s
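One caveat if you scale the requests version up: session.get() waits indefinitely by default, so a single stalled response can tie up a worker thread. A minimal sketch of the guard, with an example timeout value and an optional browser-like User-Agent (some servers reject the default python-requests one):

import requests

URL = "https://oilprice.com/Energy/Crude-Oil/Page-{}.html"

with requests.Session() as session:
    # optional: some servers reject the default python-requests User-Agent
    session.headers.update({"User-Agent": "Mozilla/5.0"})
    # a per-request timeout keeps one stalled response from hanging a worker;
    # 10 seconds is an example value, not a tuned one
    response = session.get(URL.format(104), timeout=10)
    response.raise_for_status()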