google-api youtube-api youtube-data-api google-api-python-client

YouTube Playlist API does not return all videos in a channel

We are trying to get all videos in a channel, like this. This list has 291k videos. We figured out the channel ID of this channel (and replaced the second character, "C", in the ID with "U" to form the playlist ID), and are trying the code below, iterating over 50 videos at a time. We are getting only up to about 20k videos, not more than that. Any idea how to fix this and get all 291k videos in this channel? We checked this for a variety of channels with a large number of videos; all have the same problem.

from googleapiclient.discovery import build

# Placeholder value: substitute a real YouTube Data API v3 key before running.
api_key = "my Google YouTube API V3 key"

# API client object; the key is passed as the developer key.
youtube = build('youtube', 'v3', developerKey=api_key)
def get_channel_videos(playlist_id="UU..."):
    """Collect every item of a playlist by following the API's page tokens.

    Args:
        playlist_id: ID of the playlist to enumerate. The default is the
            literal placeholder from the original snippet and must be
            replaced with a real playlist ID (for a channel's uploads, the
            channel ID with its second character changed from "C" to "U").

    Returns:
        A list with the ``items`` entries of every page, requested with
        ``part='snippet'``, in the order the API returned them.

    Note:
        For channels with a very large number of videos this enumeration
        has been observed to stop at roughly 20k items even though the
        channel holds far more.
    """
    videos = []
    next_page_token = None
    while True:
        res = youtube.playlistItems().list(
            playlistId=playlist_id,
            part='snippet',
            maxResults=50,
            pageToken=next_page_token,
        ).execute()
        videos.extend(res['items'])
        # A response without 'nextPageToken' marks the last page.
        next_page_token = res.get('nextPageToken')
        if next_page_token is None:
            break
    return videos
videos = get_channel_videos()

# Video titles can contain arbitrary Unicode characters, so the encoding is
# pinned instead of relying on the platform default (which can raise
# UnicodeEncodeError on some systems). Mode "a" appends to an existing file.
with open("video.txt", "a", encoding="utf-8") as myfile:
    myfile.writelines(
        f"{video['snippet']['resourceId']['videoId']} => {video['snippet']['title']}\n"
        for video in videos
    )

print(f"Total video count => {len(videos)}")

Solution

I investigated many different approaches, and the only one that seems to work perfectly is the following, which is based on web-scraping the Videos tab of the specified channel:

import json
import time

import requests
from lxml import html

# Handle of the channel whose Videos tab is scraped.
CHANNEL_HANDLE = '@MLB'
text = requests.get(f'https://www.youtube.com/{CHANNEL_HANDLE}/videos').text
tree = html.fromstring(text)

# The page data is looked up as a `ytInitialData = ...` assignment inside one
# of the page's <script> elements.
ytVariableName = 'ytInitialData'
ytVariableDeclaration = ytVariableName + ' = '
for script in tree.xpath('//script'):
    scriptContent = script.text_content()
    if ytVariableDeclaration in scriptContent:
        # [:-1] drops the last character of the script text before parsing
        # (presumably the ';' that terminates the assignment).
        ytVariableData = json.loads(scriptContent.split(ytVariableDeclaration)[1][:-1])
        break
else:
    # Without this, a page lacking the declaration would only surface later
    # as a NameError on ytVariableData.
    raise RuntimeError(
        f'{ytVariableName} was not found in the page for {CHANNEL_HANDLE}; '
        'the response may not be the expected channel page'
    )

# tabs[1] is presumably the "Videos" tab of the channel page -- confirm if the
# page layout changes.
contents = ytVariableData['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer']['content']['richGridRenderer']['contents']

# Every video ID collected so far; being a set, a repeated ID is stored once.
videoIds = set()


def treatContents(contents):
    """Record the leading video IDs of ``contents`` and return the next token.

    Walks ``contents`` in order, adding the ``videoId`` of each
    ``richItemRenderer`` entry to the module-level ``videoIds`` set, and stops
    at the first entry that is not a ``richItemRenderer``. Prints the running
    number of collected IDs, then returns the result of
    ``getContinuationToken`` for the same list.
    """
    for entry in contents:
        if 'richItemRenderer' in entry:
            renderer = entry['richItemRenderer']['content']['videoRenderer']
            videoIds.add(renderer['videoId'])
        else:
            # First non-video entry ends the scan of this page.
            break
    print(len(videoIds))
    return getContinuationToken(contents)

def getContinuationToken(contents):
    """Return the continuation token carried by the last entry of ``contents``.

    Args:
        contents: List of renderer dicts for one page of results.

    Returns:
        The token string found under the last entry's
        ``continuationItemRenderer``, or ``None`` when ``contents`` is empty
        or its last entry has no ``continuationItemRenderer``.
    """
    # An empty page has no last entry to inspect; treat it as "no more pages"
    # instead of failing with IndexError.
    if not contents:
        return None
    # The last entry is used rather than a fixed index because a page
    # sometimes has 29 actual results instead of 30.
    lastContent = contents[-1]
    if 'continuationItemRenderer' not in lastContent:
        return None
    return lastContent['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token']

# Process the first page (embedded in the HTML) and get the token for the next.
continuationToken = treatContents(contents)

url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
    'Content-Type': 'application/json'
}
requestData = {
    'context': {
        'client': {
            'clientName': 'WEB',
            'clientVersion': '2.20240313.05.00'
        }
    }
}

# Consecutive failed attempts tolerated for one page before giving up, so a
# persistent error response cannot turn into an endless tight loop.
MAX_RETRIES = 10
retries = 0

# Each response yields the token for the following page; None ends the loop.
while continuationToken is not None:
    requestData['continuation'] = continuationToken
    data = requests.post(url, headers=headers, json=requestData).json()
    # Happens not deterministically sometimes.
    if 'onResponseReceivedActions' not in data:
        retries += 1
        if retries > MAX_RETRIES:
            # Stop rather than raise so the IDs gathered so far stay usable.
            print(f'Giving up after {MAX_RETRIES} consecutive retries')
            break
        print('Retrying')
        # Short pause before re-sending the same continuation token.
        time.sleep(1)
        continue
    retries = 0
    continuationItems = data['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']
    continuationToken = treatContents(continuationItems)

While the @MLB About page claims 291,597 videos, my method finds 289,814 unique videos. It is unknown where the difference in count comes from; it possibly comes from live streams and unlisted videos.