Tags: pandas, feedparser

How to add rows to a DataFrame in pandas


I'm building a news scraper:

import feedparser
import pandas as pd
from datetime import datetime

archive = pd.read_csv("national_news_scrape.csv")
pd.set_option('display.max_colwidth', None)

# Your list of feeds
feeds = [{"type": "news","title": "BBC", "url": "http://feeds.bbci.co.uk/news/uk/rss.xml"},
        {"type": "news","title": "The Economist", "url": "https://www.economist.com/international/rss.xml"},    
        {"type": "news","title": "The New Statesman", "url": "https://www.newstatesman.com/feed"},    
        {"type": "news","title": "The New York Times", "url": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"},
        {"type": "news","title": "Metro UK","url": "https://metro.co.uk/feed/"},
        {"type": "news", "title": "Evening Standard", "url": "https://www.standard.co.uk/rss.xml"},
        {"type": "news","title": "Daily Mail", "url": "https://www.dailymail.co.uk/articles.rss"},
        {"type": "news","title": "Sky News", "url": "https://news.sky.com/feeds/rss/home.xml"},
        {"type": "news", "title": "The Mirror", "url": "https://www.mirror.co.uk/news/?service=rss"},
        {"type": "news", "title": "The Sun", "url": "https://www.thesun.co.uk/news/feed/"},
        {"type": "news", "title": "Sky News", "url": "https://news.sky.com/feeds/rss/home.xml"},
        {"type": "news", "title": "The Guardian", "url": "https://www.theguardian.com/uk/rss"},
        {"type": "news", "title": "The Independent", "url": "https://www.independent.co.uk/news/uk/rss"},
        #{"type": "news", "title": "The Telegraph", "url": "https://www.telegraph.co.uk/news/rss.xml"},
        {"type": "news", "title": "The Times", "url": "https://www.thetimes.co.uk/?service=rss"}]

# Create an empty DataFrame to store the news
news_df = pd.DataFrame(columns=['source', 'title', 'date', 'summary', "url"])

# For each feed, parse it and add the news to the DataFrame
for feed in feeds:
    print(f"Scraping: {feed['title']}")
    d = feedparser.parse(feed['url'])
    for entry in d.entries:
        # Some feeds do not have a 'summary' field; handle that case
        summary = entry.summary if hasattr(entry, 'summary') else ''
        url = entry.link
        # Add the news to the DataFrame
        news_df = news_df.append({'source': feed['title'],
                                  'title': entry.title,
                                  'url': url,
                                  'date': datetime(*entry.published_parsed[:6]),
                                  'summary': summary},
                                 ignore_index=True)

combined = pd.concat([news_df, archive]).drop_duplicates(subset=['summary'])
combined.date = pd.to_datetime(combined.date)
combined = combined.sort_values("date", ascending=False)

# Save the DataFrame to a CSV file
combined.to_csv('national_news_scrape.csv', index=False)

This was working, but now I'm getting an error:

AttributeError: 'DataFrame' object has no attribute 'append'

I know that append was deprecated (and removed entirely in pandas 2.0), but I can't get concat to work either. Perplexed.


Solution

  • You have several options that achieve the same result as appending:

    1. Assigning to a new row label with df.loc[df.shape[0]] = new_row
    2. Using pd.concat() twice: once inside the loop to append each new row, then once more to concatenate the result with the archive data
    3. Using the same idea as option 2, but collecting all parsed rows in a plain Python list, converting that list into a DataFrame once after the loop, and concatenating it with the archive DataFrame (by far the fastest; see the sketch after the benchmarks below)

    Here is sample code with runtime measurements:

    import pandas as pd
    import numpy as np
    
    new_data = np.random.randint(0, 100, size=(10000, 5))  # 10,000 rows of fake data to append
    archive = [[0, 0, 0, 0, 0]]  # seed the archive CSV with a single placeholder row
    pd.DataFrame(archive).to_csv('./scratching.csv', index=False)
    

    Option 1:

    %%timeit
    
    archive = pd.read_csv('./scratching.csv')
    
    # each assignment to a brand-new label grows the DataFrame by one row
    for i in range(new_data.shape[0]):
        archive.loc[archive.shape[0]] = [new_data[i,j] for j in range(new_data.shape[1])]
    

    Runtime: 6.13 s ± 36.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

    Option 2:

    %%timeit
    
    temp_df = pd.DataFrame(columns=[0, 1, 2, 3, 4])
    archive = pd.read_csv('./scratching.csv')
    
    for i in range(new_data.shape[0]):
        data = [new_data[i,j] for j in range(new_data.shape[1])]
        data = pd.DataFrame([data])
        temp_df = pd.concat([temp_df, data], ignore_index=True)
    
    archive = pd.concat([archive, temp_df], ignore_index=True)
    

    Runtime: 4.61 s ± 24.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

    Option 3:

    %%timeit
    
    temp = []
    archive = pd.read_csv('./scratching.csv')
    
    for i in range(new_data.shape[0]):
        temp.append([new_data[i,j] for j in range(new_data.shape[1])])
    
    temp = pd.DataFrame(temp)
    archive = pd.concat([archive, temp], ignore_index=True)
    

    Runtime: 85.9 ms ± 670 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
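
    Applied to your scraper, option 3 means collecting each entry as a dict in a plain Python list and building the DataFrame once after the loop. Here is a minimal sketch of the rewritten loop (it assumes the same feeds list and archive CSV as in your question; the published_parsed guard is an extra precaution I added, since some feeds omit that field):

    import feedparser
    import pandas as pd
    from datetime import datetime

    archive = pd.read_csv("national_news_scrape.csv")

    rows = []  # appending to a plain list is cheap
    for feed in feeds:
        d = feedparser.parse(feed['url'])
        for entry in d.entries:
            published = entry.get('published_parsed')
            rows.append({'source': feed['title'],
                         'title': entry.title,
                         'date': datetime(*published[:6]) if published else None,
                         'summary': entry.get('summary', ''),
                         'url': entry.get('link', '')})

    # build the DataFrame once, then merge with the archive as before
    news_df = pd.DataFrame(rows, columns=['source', 'title', 'date', 'summary', 'url'])
    combined = pd.concat([news_df, archive]).drop_duplicates(subset=['summary'])
    combined['date'] = pd.to_datetime(combined['date'])
    combined = combined.sort_values('date', ascending=False)
    combined.to_csv('national_news_scrape.csv', index=False)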