I am working on a project where I have to scrape a subreddit using PRAW, and I have to set a limit so that it only scrapes that many posts. For example, if I want to scrape the gaming subreddit (https://www.reddit.com/r/gaming/), I have to pass limit=100 so it scrapes the first 100 posts. But instead, I first want the total number of posts in the gaming subreddit, so that I can set that value as the limit and extract all the posts. I have searched the internet about the Pushshift API, but I don't know how to do that. Any help is appreciated!
Here is my code:
import praw
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
from psaw import PushshiftAPI

load_dotenv(find_dotenv())

# Creating a dataframe
df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])

# Instance of subreddit to be web scraped
reddit_read_only = praw.Reddit(client_id=os.environ.get("client_id"),
                               client_secret=os.environ.get("client_secret"),
                               user_agent=os.environ.get("user_agent"))

def main(name, value):
    i = 0
    subreddit = reddit_read_only.subreddit(name)
    print(subreddit.created)
    while i < value:
        # Limits the scraping to `value` posts
        for submission in subreddit.hot(limit=value):
            submission.comments.replace_more(limit=(value * 30))
            lst = []
            # If a post has comments, they are saved in the dataframe
            if submission.num_comments != 0:
                for comment in submission.comments.list():
                    lst.append(comment.body)
                df.loc[i] = [submission.title, submission.num_comments, lst]
            # If a post has no comments, 'No comments' is stored instead
            elif submission.num_comments == 0:
                df.loc[i] = [submission.title, submission.num_comments, ['No comments']]
            i += 1
    # print(df)
    name = 'Reddit_web_scrap_' + str(name)  # save the file under a certain name
    # df.to_csv(name + str('.csv'), index=False)
    return name

if __name__ == "__main__":
    print('#####################################################################')
    print('############### Reddit Web Scraping Started #########################')
    print('#####################################################################')
    print()
    name = main('gaming', 50)
    print()
    print('Created {}.csv file!'.format(name))
    print()
    print('#####################################################################')
    print('################# Reddit Web Scraping Ended #########################')
    print('#####################################################################')
I have set the limit to 50, which scrapes the first 50 posts. But I want to scrape all the posts available in gaming. If I set limit = "None", it throws this error:
TypeError: '<' not supported between instances of 'int' and 'str'
And that is logical, since "None" is a string and the while i < value comparison pits an integer against it. So I guess I won't be able to use limit = "None".
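From what I understand, though, PRAW does accept the actual None object (not the string "None") as a limit; the error above comes from my own while i < value comparison, not from PRAW itself. With the while guard dropped, a minimal sketch like this (just counting what comes back) should run, but as far as I can tell the listing endpoints only serve roughly the first 1000 posts, so it still would not cover the whole subreddit:
import os
import praw
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

reddit = praw.Reddit(client_id=os.environ.get("client_id"),
                     client_secret=os.environ.get("client_secret"),
                     user_agent=os.environ.get("user_agent"))

# limit=None (the object, not the string "None") asks PRAW for as many
# posts as the listing will serve, which caps out at around 1000
count = 0
for submission in reddit.subreddit('gaming').hot(limit=None):
    count += 1
print('Fetched {} posts'.format(count))
That is why I am looking for the real total via Pushshift instead.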
I have created a function total_posts(), with the help of the Pushshift API (via pmaw), that gives me the total number of posts available for a particular subreddit:
# Importing dependencies
import praw
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
from pmaw import PushshiftAPI

load_dotenv(find_dotenv())

# Creating a dataframe
df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])

# Instance of subreddit to be web scraped
reddit_read_only = praw.Reddit(client_id=os.environ.get("client_id"),
                               client_secret=os.environ.get("client_secret"),
                               user_agent=os.environ.get("user_agent"))

def total_posts(name):
    print("Calculating total number of posts")
    print()
    api = PushshiftAPI()
    # Pull every submission for the requested subreddit via Pushshift
    api_request_generator = api.search_submissions(subreddit=name, score=">=0")
    all_submissions = pd.DataFrame([submission for submission in api_request_generator])
    print("Total number of posts in subreddit {} is {}".format(name, all_submissions.shape[0]))
    return all_submissions.shape[0]

def main(name, value):
    print('Creating dataframe')
    print()
    i = 0
    subreddit = reddit_read_only.subreddit(name)
    while i < value:
        # Limits the scraping to `value` posts
        for submission in subreddit.hot(limit=value):
            submission.comments.replace_more(limit=(value * 30))
            lst = []
            # If a post has comments, they are saved in the dataframe
            if submission.num_comments != 0:
                for comment in submission.comments.list():
                    lst.append(comment.body)
                df.loc[i] = [submission.title, submission.num_comments, lst]
            # If a post has no comments, 'No comments' is stored instead
            elif submission.num_comments == 0:
                df.loc[i] = [submission.title, submission.num_comments, ['No comments']]
            i += 1
    print(df)
    name = 'Reddit_web_scrap_' + str(name)  # save the file under a certain name
    df.to_csv(name + str('.csv'), index=False)
    return name

if __name__ == "__main__":
    subreddit_name = 'gaming'
    print('#####################################################################')
    print('########## Reddit Web Scraping Started for {} ##############'.format(subreddit_name))
    print('#####################################################################')
    print()
    posts_number = total_posts(subreddit_name)
    print()
    saved_name = main(subreddit_name, posts_number)
    print()
    print('Created {}.csv file!'.format(saved_name))
    print()
    print('#####################################################################')
    print('################# Reddit Web Scraping Ended #########################')
    print('#####################################################################')
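While searching, I also found mentions that the raw Pushshift endpoint takes a metadata=true parameter and reports the match count in metadata.total_results, which would avoid downloading every submission just to count them. I have not verified this; the endpoint URL, the parameters, and the response shape below are all assumptions from what I read:
import requests

def total_posts_via_metadata(name):
    # Assumption: the submission-search endpoint still honors metadata=true
    # and returns the match count under metadata.total_results
    url = 'https://api.pushshift.io/reddit/search/submission/'
    params = {'subreddit': name, 'metadata': 'true', 'size': 0}
    response = requests.get(url, params=params)
    response.raise_for_status()
    return response.json()['metadata']['total_results']

print(total_posts_via_metadata('gaming'))
Is this the right way to get the total number of posts, or is there a cleaner way to do it with pmaw or PRAW?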