How to Bypass HTTP 403 Error When Scraping CoinGecko with Python?

I am trying to scrape the Bitcoin markets section from CoinGecko using Python. However, I keep encountering an HTTP 403 error. I have tried using the requests library with custom headers to mimic a real browser, but I still get the same error.

Here is the code I am using:

import requests
import pandas as pd

# CoinGecko page for Bitcoin; fetch_page() requests it with a "?page=N" query string.
base_url = "https://www.coingecko.com/en/coins/bitcoin"

def fetch_page(url, page):
    """Download one results page and return its HTML.

    Args:
        url: Address of the page, without the ``page`` query parameter.
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The response body as text, or ``None`` when the request fails or the
        server answers with any status other than 200.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }
    try:
        # Let requests build the query string, and bound the wait so a stalled
        # connection cannot hang the script indefinitely.
        response = requests.get(
            url, params={"page": page}, headers=headers, timeout=30
        )
    except requests.RequestException as exc:
        # Network-level failures follow the same "report and return None"
        # contract as a non-200 status.
        print(f"Failed to fetch page {page}: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch page {page}: Status code {response.status_code}")
        return None
    return response.text

def extract_markets(html):
    """Return the first HTML table found in *html* as a DataFrame.

    Args:
        html: Markup of a page, as a string.

    Returns:
        The first table parsed by ``pandas.read_html``, or an empty DataFrame
        when the markup contains no table.
    """
    # Imported locally because this snippet's import block does not provide it.
    from io import StringIO

    try:
        # Wrap the markup in a file-like object: handing literal HTML text to
        # read_html is deprecated in recent pandas releases.
        tables = pd.read_html(StringIO(html))
    except ValueError:
        # read_html signals "no tables found" by raising ValueError rather
        # than by returning an empty list.
        return pd.DataFrame()
    return tables[0] if tables else pd.DataFrame()

def scrape_all_pages(base_url, max_pages=10):
    """Fetch pages 1..max_pages of *base_url* and stack their market tables.

    Scraping stops at the first page that cannot be fetched or that yields an
    empty table. Returns one DataFrame with a fresh index, or an empty
    DataFrame when no page produced any rows.
    """
    collected = []
    for page_number in range(1, max_pages + 1):
        print(f"Scraping page {page_number}...")
        markup = fetch_page(base_url, page_number)
        # A failed fetch and an empty table both end the crawl.
        table = extract_markets(markup) if markup is not None else None
        if table is None or table.empty:
            break
        collected.append(table)

    if not collected:
        return pd.DataFrame()
    return pd.concat(collected, ignore_index=True)

# Upper bound on the number of pages requested; raise it to collect more.
max_pages = 10
# Run the scraper; the result holds the rows of every page that was fetched.
df = scrape_all_pages(base_url, max_pages=max_pages)

# Show the combined table (an empty DataFrame when nothing was fetched).
print(df)

error:

Scraping page 1...
Failed to fetch page 1: Status code 403
Empty DataFrame
Columns: []
Index: []

I also tried a solution suggested on Stack Overflow, but it did not resolve the issue.

Could someone suggest a workaround or a more effective way to scrape this data? Any help would be greatly appreciated. Thank you in advance.

Solution

As suggested in the answer to your other question, your requests are being identified as bot traffic, which is why you get the 403 error. Use Playwright to access the site through a real browser instead.

import time
from io import StringIO

from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import pandas as pd

# CoinGecko spot-markets listing for Bitcoin; scrape_all_pages() appends
# "?page=N" to this address.
URL = "https://www.coingecko.com/en/coins/1/markets/spot"

# Playwright is started manually (no ``with`` block), so shutting it down is
# the script's own responsibility.
playwright = sync_playwright().start()
# Headed Chromium, with each Playwright operation slowed down by 2000 ms.
browser = playwright.chromium.launch(headless=False, slow_mo=2000)
context = browser.new_context(
    viewport={"width": 1280, "height": 900},
)

# Single tab reused for every request; fetch_page() reads this module-level name.
page = context.new_page()

def fetch_page(url):
    """Load *url* in the shared Playwright page and return the resulting HTML.

    Uses the module-level ``page`` object rather than taking it as an argument.
    """
    print(url)
    page.goto(url)
    # Fixed five-second pause before the DOM is read.
    time.sleep(5)

    rendered_html = page.content()
    return rendered_html

def scrape_all_pages(url, max_pages=10):
    """Collect every HTML table found on pages 1..max_pages of *url*.

    Args:
        url: Listing address; ``?page=N`` is appended for each page.
        max_pages: Number of pages to visit, starting at page 1.

    Returns:
        One DataFrame with all parsed tables concatenated under a fresh index,
        or an empty DataFrame when no page contained a table.
    """
    markets = []
    # Named ``page_num`` rather than ``page`` so it cannot be confused with the
    # module-level Playwright page that fetch_page() relies on.
    for page_num in range(1, max_pages + 1):
        html = fetch_page(f"{url}?page={page_num}")

        try:
            tables = pd.read_html(StringIO(html))
        except ValueError as e:
            # read_html raises ValueError when the markup has no table; skip
            # that page instead of aborting the whole run.
            print(f"No tables found on page {page_num}: {e}")
        else:
            markets.extend(tables)

    # pd.concat refuses an empty list, so handle "nothing scraped" explicitly.
    if not markets:
        return pd.DataFrame()
    return pd.concat(markets, ignore_index=True)

max_pages = 10
try:
    df = scrape_all_pages(URL, max_pages)
finally:
    # Release the tab, context, browser and Playwright driver in reverse
    # order of creation, even when scraping raised.
    page.close()
    context.close()
    browser.close()
    playwright.stop()

# Drop rows in which every column is NaN.
df = df.dropna(how='all')

print(df)

Top of output:

       #             Exchange Unnamed: 2       Pair       Price Spread    +2% Depth    -2% Depth      24h Volume Volume % Last Updated  Trust Score
1    1.0              Binance        CEX   BTC/USDT  $61,578.60  0.01%  $14,564,938  $19,766,330  $1,226,281,740    5.72%     Recently          NaN
2    2.0    Coinbase Exchange        CEX    BTC/USD  $61,570.63  0.01%  $15,912,548  $14,809,605    $667,341,947    3.12%     Recently          NaN
3    3.0               Kraken        CEX    BTC/USD  $61,584.00  0.01%  $13,621,680  $13,100,698     $50,592,315    0.24%     Recently          NaN
4    4.0              Gate.io        CEX   BTC/USDT  $61,584.36  0.01%  $12,523,800  $11,856,866    $202,923,100    0.95%     Recently          NaN
5    5.0              Binance        CEX  BTC/FDUSD  $61,568.21  0.01%   $8,355,196   $8,489,839  $1,901,656,552    8.88%     Recently          NaN
6    6.0                  OKX        CEX   BTC/USDT  $61,588.33  0.01%   $4,552,443  $13,952,016    $398,284,635    1.86%     Recently          NaN
7    7.0               Bitget        CEX   BTC/USDT  $61,580.27  0.01%   $8,598,359   $8,848,635    $239,239,284    1.12%     Recently          NaN
8    8.0               Kraken        CEX    BTC/EUR  $61,589.72  0.01%   $7,703,734   $7,050,064     $27,293,519    0.13%     Recently          NaN
9    9.0                Bybit        CEX   BTC/USDT  $61,583.98  0.01%   $2,208,077   $1,347,924  $1,103,150,476    5.15%     Recently          NaN
10  10.0               Pionex        CEX   BTC/USDT  $61,588.25  0.01%  $17,409,820  $15,637,094    $224,747,215    1.05%     Recently          NaN
12  11.0              Binance        CEX   WBTC/BTC  $61,700.03  0.02%   $5,446,446  $20,784,820      $8,023,065    0.04%     Recently          NaN
13  12.0  Crypto.com Exchange        CEX    BTC/USD  $61,588.55  0.01%   $2,539,978   $5,433,264    $359,030,532    1.68%     Recently          NaN
14  13.0  Crypto.com Exchange        CEX   BTC/USDT  $61,581.55  0.01%   $2,635,507   $5,917,342    $267,921,739    1.25%     Recently          NaN
15  14.0              Binance        CEX    ETH/BTC   $3,449.52  0.02%   $7,954,561   $9,115,420     $73,631,305    0.34%     Recently          NaN
16  15.0                LBank        CEX   BTC/USDT  $61,588.16  0.01%  $12,152,948  $12,658,338    $401,602,782    1.87%     Recently          NaN
17  16.0                 MEXC        CEX   BTC/USDT  $61,578.24  0.01%   $1,693,904   $2,007,183    $477,125,505    2.23%     Recently          NaN
18  17.0           CoinTR Pro        CEX   BTC/USDT  $61,578.39  0.01%   $9,294,010   $3,947,978    $158,178,596    0.74%     Recently          NaN
19  18.0             Bitfinex        CEX   BTC/USDT  $61,586.63  0.02%   $5,122,254   $5,538,887      $7,127,016    0.03%     Recently          NaN
20  19.0              Binance        CEX    BNB/BTC     $581.02  0.01%   $1,040,939   $5,591,340     $13,706,709    0.06%     Recently          NaN
21  20.0                Dcoin        CEX   BTC/USDT  $61,583.99  0.02%   $5,939,915   $5,770,785     $20,137,964    0.09%     Recently          NaN

Here's an asynchronous implementation too.

import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

# CoinGecko spot-markets listing for Bitcoin; "?page=N" is appended per page.
URL = "https://www.coingecko.com/en/coins/1/markets/spot"

async def fetch_page(page, url):
    """Navigate *page* to *url*, pause five seconds, and return the page HTML."""
    print(f"Fetching: {url}")
    await page.goto(url)
    # Fixed pause before the DOM is read.
    await asyncio.sleep(5)
    rendered_html = await page.content()
    return rendered_html

async def scrape_all_pages(url, max_pages=10):
    """Collect every HTML table found on pages 1..max_pages of *url*.

    Args:
        url: Listing address; ``?page=N`` is appended for each page.
        max_pages: Number of pages to visit, starting at page 1.

    Returns:
        One DataFrame with all parsed tables concatenated under a fresh index,
        or an empty DataFrame when no page contained a table.
    """
    markets = []
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=False, slow_mo=2000)
        try:
            context = await browser.new_context(viewport={"width": 1280, "height": 900})
            page = await context.new_page()

            for page_num in range(1, max_pages + 1):
                full_url = f"{url}?page={page_num}"
                html = await fetch_page(page, full_url)

                try:
                    dfs = pd.read_html(StringIO(html))
                except ValueError as e:
                    # read_html raises ValueError when the markup has no table.
                    print(f"No tables found on page {page_num}: {e}")
                else:
                    markets.extend(dfs)
        finally:
            # Closing the browser also disposes of its contexts and pages, and
            # doing it in ``finally`` covers a failure part-way through.
            await browser.close()

    # pd.concat refuses an empty list, so handle "nothing scraped" explicitly.
    if not markets:
        return pd.DataFrame()
    return pd.concat(markets, ignore_index=True)

async def main():
    """Scrape ten pages of the listing and print the cleaned-up table."""
    page_limit = 10
    markets = await scrape_all_pages(URL, page_limit)

    # Rows in which every column is NaN are removed before printing.
    print(markets.dropna(how="all"))


if __name__ == "__main__":
    # Run the coroutine only when the file is executed as a script.
    asyncio.run(main())