google-api youtube-api youtube-data-api google-api-python-client

YouTube Playlist API does not return all videos in a channel


We are trying to get all videos in a channel, like this one. This list has 291k videos. We figured out the channel ID of this channel (and replaced the second letter, "C", in the ID with "U" to get the ID of its uploads playlist), and are trying the code below, iterating over 50 videos at a time. We are getting only up to about 20k videos, not more than that. Any idea how to fix this and get all 291k videos in this channel? We checked this for a variety of channels with a large number of videos; all of them have the same problem.

from googleapiclient.discovery import build

# Placeholder credential: substitute a real YouTube Data API v3 key before running.
api_key = "my Google YouTube API V3 key"

# Module-level client object for the 'youtube' service, version 'v3'.
youtube = build('youtube', 'v3', developerKey=api_key)
def get_channel_videos():
    """Collect every playlist item the API returns for the hard-coded playlist.

    Requests pages of up to 50 items via ``playlistItems().list`` and follows
    ``nextPageToken`` until a response no longer carries one.

    Returns:
        list: The concatenated ``items`` entries of all fetched pages.
    """
    collected = []
    page_token = None
    while True:
        request = youtube.playlistItems().list(
            playlistId="UU...",
            part='snippet',
            maxResults=50,
            pageToken=page_token,
        )
        page = request.execute()
        collected.extend(page['items'])
        # A response without 'nextPageToken' marks the last page.
        page_token = page.get('nextPageToken')
        if page_token is None:
            return collected
videos = get_channel_videos()
# Video titles are arbitrary Unicode text, so pin the file encoding instead of
# relying on the platform default, which may be unable to encode some titles.
with open("video.txt", "a", encoding="utf-8") as myfile:
    for video in videos:
        # One line per video: "<video id> => <title>".
        myfile.write(f"{video['snippet']['resourceId']['videoId']} => {video['snippet']['title']}\n")

print(f"Total video count => {len(videos)}")

Solution

  • I investigated many different approaches, and the only one that seems to work perfectly is the following one, based on web-scraping the Videos tab of the specified channel:

    import requests
    from lxml import html
    import json
    
    CHANNEL_HANDLE = '@MLB'
    # Bound the wait so an unresponsive server cannot hang the script forever.
    text = requests.get(f'https://www.youtube.com/{CHANNEL_HANDLE}/videos', timeout=30).text
    tree = html.fromstring(text)

    # Look for the <script> element whose text contains `ytInitialData = `.
    ytVariableName = 'ytInitialData'
    ytVariableDeclaration = ytVariableName + ' = '
    for script in tree.xpath('//script'):
        scriptContent = script.text_content()
        if ytVariableDeclaration in scriptContent:
            # Parse what follows the declaration, minus its last character
            # (presumably the statement's trailing `;` -- verify if parsing fails).
            ytVariableData = json.loads(scriptContent.split(ytVariableDeclaration)[1][:-1])
            break
    else:
        # No script matched: fail here with a clear message instead of a
        # NameError on ytVariableData further down.
        raise RuntimeError(f'Could not find {ytVariableName} in the videos page of {CHANNEL_HANDLE}')

    # NOTE(review): tabs[1] is presumably the Videos tab -- confirm against the page layout.
    contents = ytVariableData['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer']['content']['richGridRenderer']['contents']
    
    # Unique video ids gathered across all processed pages.
    videoIds = set()

    def treatContents(contents):
        """Record the video ids of a page and return its continuation token.

        Walks *contents* in order, adding each entry's video id to the
        module-level ``videoIds`` set, and stops at the first entry that has
        no ``richItemRenderer`` key. Prints the running count of unique ids,
        then delegates to ``getContinuationToken`` for the return value.
        """
        for entry in contents:
            if 'richItemRenderer' not in entry:
                break
            renderer = entry['richItemRenderer']['content']['videoRenderer']
            videoIds.add(renderer['videoId'])
        print(len(videoIds))
        return getContinuationToken(contents)
    
    def getContinuationToken(contents):
        """Return the continuation token carried by the last entry of *contents*.

        Args:
            contents: Sequence of renderer dicts making up one page of results.

        Returns:
            The token string, or ``None`` when *contents* is empty or its last
            entry has no ``continuationItemRenderer`` key.
        """
        # An empty page has no last entry; report "no more pages" instead of
        # failing with IndexError on contents[-1].
        if not contents:
            return None
        # Sometimes have 29 actual results instead of 30.
        lastContent = contents[-1]
        if 'continuationItemRenderer' not in lastContent:
            return None
        return lastContent['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token']
    
    # Process the first page (embedded in the HTML), then follow continuation
    # tokens through the browse endpoint until none is returned.
    continuationToken = treatContents(contents)
    if continuationToken is not None:
        url = 'https://www.youtube.com/youtubei/v1/browse'
        headers = {
            'Content-Type': 'application/json'
        }
        requestData = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20240313.05.00'
                }
            }
        }
        while continuationToken is not None:
            requestData['continuation'] = continuationToken
            try:
                # Bound the wait so a stalled connection cannot hang the crawl forever.
                data = requests.post(url, headers=headers, json=requestData, timeout=30).json()
            except requests.exceptions.Timeout:
                # Treat a timeout like any other transient failure: same token, try again.
                print('Retrying')
                continue
            # Happens not deterministically sometimes.
            if 'onResponseReceivedActions' not in data:
                print('Retrying')
                continue
            continuationItems = data['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']
            continuationToken = treatContents(continuationItems)
    

    While the @MLB About page claims 291,597 videos, my method finds 289,814 unique videos. It is unknown where the difference in counts comes from; it is possibly due to live streams and unlisted videos.