I'm building a news scraper:
import feedparser
import pandas as pd
from datetime import datetime
archive = pd.read_csv("national_news_scrape.csv")
pd.set_option('display.max_colwidth', None)
# Your list of feeds
feeds = [{"type": "news","title": "BBC", "url": "http://feeds.bbci.co.uk/news/uk/rss.xml"},
{"type": "news","title": "The Economist", "url": "https://www.economist.com/international/rss.xml"},
{"type": "news","title": "The New Statesman", "url": "https://www.newstatesman.com/feed"},
{"type": "news","title": "The New York Times", "url": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"},
{"type": "news","title": "Metro UK","url": "https://metro.co.uk/feed/"},
{"type": "news", "title": "Evening Standard", "url": "https://www.standard.co.uk/rss.xml"},
{"type": "news","title": "Daily Mail", "url": "https://www.dailymail.co.uk/articles.rss"},
{"type": "news","title": "Sky News", "url": "https://news.sky.com/feeds/rss/home.xml"},
{"type": "news", "title": "The Mirror", "url": "https://www.mirror.co.uk/news/?service=rss"},
{"type": "news", "title": "The Sun", "url": "https://www.thesun.co.uk/news/feed/"},
{"type": "news", "title": "Sky News", "url": "https://news.sky.com/feeds/rss/home.xml"},
{"type": "news", "title": "The Guardian", "url": "https://www.theguardian.com/uk/rss"},
{"type": "news", "title": "The Independent", "url": "https://www.independent.co.uk/news/uk/rss"},
#{"type": "news", "title": "The Telegraph", "url": "https://www.telegraph.co.uk/news/rss.xml"},
{"type": "news", "title": "The Times", "url": "https://www.thetimes.co.uk/?service=rss"}]
# Create an empty DataFrame to store the news
news_df = pd.DataFrame(columns=['source', 'title', 'date', 'summary', "url"])
# For each feed, parse it and add the news to the DataFrame
for feed in feeds:
    print(f"Scraping: {feed['title']}")
    d = feedparser.parse(feed['url'])
    for entry in d.entries:
        # Some feeds do not have a 'summary' field; handle that case
        summary = entry.summary if hasattr(entry, 'summary') else ''
        url = entry.link
        # Add the entry to the DataFrame as a single row
        news_df = news_df.append({'source': feed['title'],
                                  'title': entry.title,
                                  'url': url,
                                  'date': datetime(*entry.published_parsed[:6]),
                                  'summary': summary},
                                 ignore_index=True)
combined = pd.concat([news_df, archive]).drop_duplicates(subset=['summary'])
combined.date = pd.to_datetime(combined.date)
combined = combined.sort_values("date", ascending=False)
# Save the DataFrame to a CSV file
combined.to_csv('national_news_scrape.csv', index=False)
It was working, but now I'm getting this error:
AttributeError: 'DataFrame' object has no attribute 'append'
I know that append has been deprecated, but I can't get concat to work either. Perplexed.
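DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, which is why code that used to run starts raising this AttributeError after a pandas upgrade. You can confirm which version you're on:
import pandas as pd
# DataFrame.append only exists in pandas < 2.0
print(pd.__version__)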
You have a few options to achieve the same result as appending:
Option 1: assign each new row in place with df.loc[df.shape[0]] = new_row.
Option 2: call pd.concat() inside the loop to build up a temporary DataFrame row by row, then concatenate that with the archive data.
Option 3: collect the rows in a plain Python list, build a DataFrame from it once, and concatenate that with the archive data.
Here is sample code with runtime measurements:
import pandas as pd
import numpy as np
new_data = np.random.randint(0, 100, size=(10000, 5))  # 10,000 rows of fake data to append
archive = [[0, 0, 0, 0, 0]]
pd.DataFrame(archive).to_csv('./scratching.csv', index=False)  # stand-in for the archive CSV
Option 1:
%%timeit
archive = pd.read_csv('./scratching.csv')
for i in range(new_data.shape[0]):
    archive.loc[archive.shape[0]] = [new_data[i, j] for j in range(new_data.shape[1])]
Runtime: 6.13 s ± 36.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Option 2:
%%timeit
temp_df = pd.DataFrame(columns=[0, 1, 2, 3, 4])
archive = pd.read_csv('./scratching.csv')
for i in range(new_data.shape[0]):
    data = [new_data[i, j] for j in range(new_data.shape[1])]
    data = pd.DataFrame([data])
    temp_df = pd.concat([temp_df, data], ignore_index=True)
archive = pd.concat([archive, temp_df], ignore_index=True)
Runtime: 4.61 s ± 24.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Option 3:
%%timeit
temp = []
archive = pd.read_csv('./scratching.csv')
for i in range(new_data.shape[0]):
    temp.append([new_data[i, j] for j in range(new_data.shape[1])])
temp = pd.DataFrame(temp)
archive = pd.concat([archive, temp], ignore_index=True)
Runtime: 85.9 ms ± 670 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
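Option 3 wins by a wide margin: appending to a Python list is cheap, while every in-loop DataFrame append or concat copies the whole frame again. Applied to your scraper, the loop becomes something like this (a sketch using your original field names; the .get() guards are my addition, since some feeds omit the summary or publication date):
rows = []
for feed in feeds:
    print(f"Scraping: {feed['title']}")
    d = feedparser.parse(feed['url'])
    for entry in d.entries:
        pub = entry.get('published_parsed')  # struct_time, or None if the feed omits it
        rows.append({'source': feed['title'],
                     'title': entry.title,
                     'date': datetime(*pub[:6]) if pub else None,
                     'summary': entry.get('summary', ''),
                     'url': entry.link})
# Build the DataFrame once; the rest of your script (concat with the archive,
# drop_duplicates, sort, to_csv) works unchanged
news_df = pd.DataFrame(rows, columns=['source', 'title', 'date', 'summary', 'url'])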