Tags: python, web-scraping, reddit, praw

How to get total number of posts of a subreddit using Python?


I am working on a project where I have to scrape a subreddit using PRAW. I have to set a limit so that it scrapes only that many posts. For example, if I want to scrape the gaming subreddit (https://www.reddit.com/r/gaming/), I have to set the limit to 100 so it scrapes the first 100 posts. Instead, I would first like to get the total number of posts in the gaming subreddit, and then set that value as the limit to extract all the posts. I have searched the internet about the Pushshift API, but I don't know how to use it. Any help is appreciated!

Here is my code:

import praw
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
from psaw import PushshiftAPI

load_dotenv(find_dotenv())

#Creating a dataframe
df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])

#Instance of subreddit to be web scraped
reddit_read_only = praw.Reddit(client_id = os.environ.get("client_id"),
                                client_secret = os.environ.get("client_secret"),
                                user_agent = os.environ.get("user_agent"))

def main(name, value):
    i = 0
    subreddit = reddit_read_only.subreddit(name)
    print(subreddit.created)
    while i < value:
        #Limits the scraping to value number of posts
        for submission in subreddit.hot(limit=value):
            submission.comments.replace_more(limit=(value*30))
            lst = []
            #If there are any comments, they will be saved in the dataframe
            if submission.num_comments != 0:
                for comment in submission.comments.list():
                    lst.append(comment.body) 
                df.loc[i] = [submission.title, submission.num_comments, lst]
            
            #If there are no comments in a post, 'No comments' will be stored
            elif submission.num_comments == 0:
                df.loc[i] = [submission.title, submission.num_comments, ['No comments']]
            i += 1
    # print(df)
    name = 'Reddit_web_scrap_'+str(name) #save the file with a certain name
    # df.to_csv(name + str('.csv'), index=False)

    return name

if __name__ == "__main__":

    print('#####################################################################')
    print('############### Reddit Web Scraping Started #########################')
    print('#####################################################################')
    print()
    name = main('gaming', 50)
    print()
    print('Created {}.csv file!'.format(name))
    print()
    print('#####################################################################')
    print('################# Reddit Web Scraping Ended #########################')
    print('#####################################################################')

I have set the limit to 50, which scrapes the first 50 posts. But I want to scrape all the posts available in gaming. If I set limit = "None", it throws this error:

TypeError: '<' not supported between instances of 'int' and 'str'

And this is logical as well, since my while loop compares the integer i against the string "None". So I guess I won't be able to use limit = "None".
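
Note: PRAW itself does accept the Python value None (without quotes) as a limit, in which case it yields as many posts as Reddit's listing provides (Reddit caps listings at roughly 1000 items). That alone would not fix my loop, though, since i < None raises a similar TypeError. A minimal sketch of the None behaviour, reusing the reddit_read_only instance from above:

#Python's None (no quotes) is a valid PRAW limit; the listing itself
#is still capped by Reddit at roughly 1000 items
for submission in reddit_read_only.subreddit('gaming').hot(limit=None):
    print(submission.title)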


Solution

  • I have created a function total_posts(), with the help of the Pushshift API (using the pmaw wrapper), that gives me the total number of posts available for a particular subreddit.

    #Importing Dependencies
    import praw
    import pandas as pd
    import os
    from dotenv import load_dotenv, find_dotenv
    from pmaw import PushshiftAPI
    
    load_dotenv(find_dotenv())
    
    #Creating a dataframe
    df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])
    
    #Instance of subreddit to be web scraped
    reddit_read_only = praw.Reddit(client_id = os.environ.get("client_id"),
                                    client_secret = os.environ.get("client_secret"),
                                    user_agent = os.environ.get("user_agent"))
    
    def total_posts(name):
        print("Calculating total number of posts")
        print()
        api = PushshiftAPI()
        api_request_generator = api.search_submissions(subreddit=name, score=">=0")
        all_submissions = pd.DataFrame([submission for submission in api_request_generator])
        print("Total number of posts in subreddit {} is {}".format(name, all_submissions.shape[0]))
    
        return all_submissions.shape[0]
    
    def main(name, value):
        print('Creating dataframe')
        print()
        subreddit = reddit_read_only.subreddit(name)
        #Limits the scraping to value number of posts
        for i, submission in enumerate(subreddit.hot(limit=value)):
            submission.comments.replace_more(limit=(value*30))
            lst = []
            #If there are any comments, they are saved in the dataframe
            if submission.num_comments != 0:
                for comment in submission.comments.list():
                    lst.append(comment.body)
                df.loc[i] = [submission.title, submission.num_comments, lst]
            #If there are no comments in a post, 'No comments' is stored
            else:
                df.loc[i] = [submission.title, submission.num_comments, ['No comments']]
        print(df)
        name = 'Reddit_web_scrap_' + str(name) #save the file with a certain name
        df.to_csv(name + str('.csv'), index=False)
    
        return name
    
    if __name__ == "__main__":
        
        subreddit_name = 'gaming'
    
        print('#####################################################################')
        print('########## Reddit Web Scraping Started for {} ##########'.format(subreddit_name))
        print('#####################################################################')
        print()
        posts_number = total_posts(subreddit_name)
        print()
        name = main(subreddit_name, posts_number)
        print()
        print('Created {}.csv file!'.format(name))
        print()
        print('#####################################################################')
        print('################# Reddit Web Scraping Ended #########################')
        print('#####################################################################')
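
As a side note, pmaw's search_submissions already yields every indexed submission, so the count can be taken straight from the generator without building a DataFrame first. A minimal sketch of that shortcut (the helper name count_posts is my own, not part of either API):

    from pmaw import PushshiftAPI

    def count_posts(name):
        api = PushshiftAPI()
        #Materialise the generator and count the submissions Pushshift
        #has indexed for this subreddit
        return len(list(api.search_submissions(subreddit=name, score=">=0")))

One caveat: Reddit's own listing endpoints (including subreddit.hot()) return at most about 1000 items, so for a large subreddit main() may still receive fewer posts than the Pushshift total, regardless of the limit passed.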