I'm trying to retrieve all the posts from a subreddit that were created in 2020-2021.
Below is the code I wrote to try to solve this:
import datetime
# First attempt: walk the subreddit's all-time "top" listing and keep the
# posts whose creation time falls strictly between two timestamps.
subreddit = reddit_read_only.subreddit("SuicideWatch")

# Scraping the top posts of all time (no explicit limit is passed here)
posts = subreddit.top("all")

# One empty list per output column; the keys become the DataFrame columns.
posts_dict = {
    column: []
    for column in (
        "Title", "Post Text",
        "ID", "Score",
        "Total Comments", "Post URL",
    )
}

# Window boundaries, converted from naive datetimes to epoch seconds.
start_date = datetime.datetime.strptime(
    '01-01-20 00:00:00', '%d-%m-%y %H:%M:%S'
).timestamp()
end_date = datetime.datetime.strptime(
    '31-12-21 12:00:00', '%d-%m-%y %H:%M:%S'
).timestamp()

for post in posts:
    # Date of each post's creation
    date = post.created
    # Skip anything outside the (start_date, end_date) window.
    if not (start_date < date < end_date):
        continue

    posts_dict["Title"].append(post.title)                  # title of the post
    posts_dict["Post Text"].append(post.selftext)           # text inside the post
    posts_dict["ID"].append(post.id)                        # unique ID of the post
    posts_dict["Score"].append(post.score)                  # score of the post
    posts_dict["Total Comments"].append(post.num_comments)  # number of comments
    posts_dict["Post URL"].append(post.url)                 # URL of the post

# Saving the data in a pandas dataframe
top_posts = pd.DataFrame(posts_dict)
top_posts
This worked; however, it only returned 72 posts.
I was able to solve this. Since I wasn't able to search for posts between a set of dates, I just looked for posts created after my start date, i.e. date > start_date.
I also modified one line to set the limit to None so that it returned more than the default limit of 100 posts.
# limit=None asks for more than the default of 100 posts from the listing.
posts = subreddit.top(time_filter="all", limit=None)
This resulted in scraping 764 posts instead of just 72.
Afterwards, I converted the "Created On" column to a datetime using the following:
# Interpret the stored epoch seconds as datetimes.
all_posts['Created On'] = all_posts['Created On'].apply(pd.to_datetime, unit='s')
Then I was able to filter for posts created in 2020-2021.
Full Code Below:
import praw
import pandas as pd
import datetime
# Collect the subreddit's all-time top posts created on or after
# 2020-01-01 (UTC), load them into a DataFrame, then keep only the posts
# created in 2020-2021.
#
# NOTE(review): `reddit_read_only` is not defined in this snippet; it is
# presumably a read-only praw.Reddit instance created beforehand.
subreddit = reddit_read_only.subreddit("SuicideWatch")

# Scraping the top posts of all time. limit=None lifts the default limit of
# 100 posts so the listing yields as many posts as it can provide.
posts = subreddit.top(time_filter="all", limit=None)

# One list per output column; the keys become the DataFrame columns.
posts_dict = {
    "Title": [], "Post Text": [],
    "ID": [], "Score": [], "Upvote Ratio": [],
    "Total Comments": [], "Created On": [], "Post URL": [],
    "Original Content": [],
}

# post.created_utc is compared against this boundary, so the boundary is
# built explicitly in UTC. A naive datetime's .timestamp() would be
# interpreted in the machine's local timezone and shift the cut-off.
start_date = '01-01-20 00:00:00'
start_date = (
    datetime.datetime.strptime(start_date, '%d-%m-%y %H:%M:%S')
    .replace(tzinfo=datetime.timezone.utc)
    .timestamp()
)

for post in posts:
    # Date of each post's creation (epoch seconds)
    date = post.created_utc
    # Coarse pre-filter: drop everything older than the start date.
    # ">=" keeps a post created exactly at the boundary.
    if date < start_date:
        continue

    # Title of each post
    posts_dict["Title"].append(post.title)
    # Text inside a post
    posts_dict["Post Text"].append(post.selftext)
    # Unique ID of each post
    posts_dict["ID"].append(post.id)
    # The score of a post
    posts_dict["Score"].append(post.score)
    # Upvote ratio of a post
    posts_dict["Upvote Ratio"].append(post.upvote_ratio)
    # Total number of comments inside the post
    posts_dict["Total Comments"].append(post.num_comments)
    # Date the post was created (epoch seconds, converted below)
    posts_dict["Created On"].append(date)
    # URL of each post
    posts_dict["Post URL"].append(post.url)
    # Whether the post is marked as original content
    posts_dict["Original Content"].append(post.is_original_content)

# Saving the data in a pandas dataframe
all_posts = pd.DataFrame(posts_dict)

# Convert epoch seconds to datetimes so the column can be filtered by year.
all_posts['Created On'] = pd.to_datetime(all_posts['Created On'], unit='s')

# Keep only posts created in 2020-2021; the loop above only enforced the
# lower bound, so anything from 2022 onwards is removed here.
created_in_range = all_posts['Created On'].dt.year.isin([2020, 2021])
all_posts = all_posts[created_in_range].reset_index(drop=True)
all_posts