I'm trying to retrieve all the posts from a subreddit that were created in 2020.
Below is the code I wrote to try to solve this:
import praw
import pandas as pd
import datetime
# Collect the top posts of a subreddit that were created on or after
# 1 January 2020 (UTC) into a pandas DataFrame.

# `reddit_read_only` is presumably a praw.Reddit client created earlier
# (not shown in this snippet).
subreddit = reddit_read_only.subreddit("SuicideWatch")

# Top posts of all time. No `limit` is passed here, so the listing's default
# cap applies and only that many posts are examined by the loop below.
posts = subreddit.top(time_filter="all")

# One list per output column; the lists are filled in lockstep.
posts_dict = {
    "Title": [],
    "Post Text": [],
    "ID": [],
    "Score": [],
    "Upvote Ratio": [],
    "Total Comments": [],
    "Created On": [],
    "Post URL": [],
    "Original Content": [],
}

# Cutoff: 1 January 2020, 00:00:00 UTC, as epoch seconds. It is built as a
# timezone-aware value because `.timestamp()` on a naive datetime is read in
# the machine's local timezone, which would shift the cutoff away from the
# UTC clock that `post.created_utc` is measured on.
start_date = datetime.datetime(
    2020, 1, 1, tzinfo=datetime.timezone.utc
).timestamp()

for post in posts:
    # Creation time of the post, in epoch seconds (UTC).
    date = post.created_utc
    # Keep posts from the cutoff onwards (inclusive). There is no upper
    # bound, so posts from later years are kept as well.
    if date < start_date:
        continue
    # Title of the post
    posts_dict["Title"].append(post.title)
    # Text inside the post
    posts_dict["Post Text"].append(post.selftext)
    # Unique ID of the post
    posts_dict["ID"].append(post.id)
    # Score of the post
    posts_dict["Score"].append(post.score)
    # Upvote ratio of the post
    posts_dict["Upvote Ratio"].append(post.upvote_ratio)
    # Total number of comments on the post
    posts_dict["Total Comments"].append(post.num_comments)
    # Creation time, still as raw epoch seconds
    posts_dict["Created On"].append(date)
    # URL of the post
    posts_dict["Post URL"].append(post.url)
    # Whether the post is marked as original content
    posts_dict["Original Content"].append(post.is_original_content)

# Saving the data in a pandas dataframe
all_posts = pd.DataFrame(posts_dict)
all_posts
I know that the default limit is 100; however, my code is only returning 79 posts. I need a few hundred posts to build a proper dataset for a project.
I also noticed that the "Created On" column is a float. I used the following code to try to get it as a date. Is this correct?
# "Created On" holds epoch seconds; scaling by 1e9 turns them into the
# nanoseconds that pd.to_datetime assumes for plain numbers by default.
all_posts["Created On"] = pd.to_datetime(all_posts["Created On"] * 1000000000)
I was able to solve this.
I modified one line to set the limit to None, so the call is no longer capped at the default limit of 100 posts.
# limit=None lifts the default cap on how many posts the listing returns.
posts = subreddit.top(time_filter="all", limit=None)
This resulted in scraping 764 posts instead of just 79.
Afterwards I altered the "Created On" column into a date using the following:
# Read the stored floats as epoch seconds and convert them to datetimes.
all_posts["Created On"] = pd.to_datetime(all_posts["Created On"], unit="s")
Then I was able to filter for posts created in 2020-2021.
Full Code Below:
import praw
import pandas as pd
import datetime
# Collect every available top post of a subreddit that was created on or
# after 1 January 2020 (UTC) into a pandas DataFrame with real datetimes.

# `reddit_read_only` is presumably a praw.Reddit client created earlier
# (not shown in this snippet).
subreddit = reddit_read_only.subreddit("SuicideWatch")

# Top posts of all time; limit=None lifts the default cap on how many posts
# the listing returns.
posts = subreddit.top(time_filter="all", limit=None)

# One list per output column; the lists are filled in lockstep.
posts_dict = {
    "Title": [],
    "Post Text": [],
    "ID": [],
    "Score": [],
    "Upvote Ratio": [],
    "Total Comments": [],
    "Created On": [],
    "Post URL": [],
    "Original Content": [],
}

# Cutoff: 1 January 2020, 00:00:00 UTC, as epoch seconds. It is built as a
# timezone-aware value because `.timestamp()` on a naive datetime is read in
# the machine's local timezone, which would shift the cutoff away from the
# UTC clock that `post.created_utc` is measured on.
start_date = datetime.datetime(
    2020, 1, 1, tzinfo=datetime.timezone.utc
).timestamp()

for post in posts:
    # Creation time of the post, in epoch seconds (UTC).
    date = post.created_utc
    # Keep posts from the cutoff onwards (inclusive). There is no upper
    # bound here; narrowing to specific years is done on the DataFrame.
    if date < start_date:
        continue
    # Title of the post
    posts_dict["Title"].append(post.title)
    # Text inside the post
    posts_dict["Post Text"].append(post.selftext)
    # Unique ID of the post
    posts_dict["ID"].append(post.id)
    # Score of the post
    posts_dict["Score"].append(post.score)
    # Upvote ratio of the post
    posts_dict["Upvote Ratio"].append(post.upvote_ratio)
    # Total number of comments on the post
    posts_dict["Total Comments"].append(post.num_comments)
    # Creation time, still as raw epoch seconds (converted below)
    posts_dict["Created On"].append(date)
    # URL of the post
    posts_dict["Post URL"].append(post.url)
    # Whether the post is marked as original content
    posts_dict["Original Content"].append(post.is_original_content)

# Saving the data in a pandas dataframe
all_posts = pd.DataFrame(posts_dict)
# Turn the epoch-second floats into datetimes so the column can be filtered
# by year.
all_posts["Created On"] = pd.to_datetime(all_posts["Created On"], unit="s")
all_posts