python web-scraping python-requests

How to Bypass HTTP 403 Error When Scraping CoinGecko with Python?


I am trying to scrape the Bitcoin markets section from CoinGecko using Python. However, I keep encountering an HTTP 403 error. I have tried using the requests library with custom headers to mimic a real browser, but I still get the same error.

Here is the code I am using:

import requests
import pandas as pd

# Base URL for Bitcoin markets on CoinGecko.
# fetch_page() appends "?page=<n>" to this value to request each page.
base_url = "https://www.coingecko.com/en/coins/bitcoin"

# Function to fetch a single page
def fetch_page(url, page, timeout=30):
    """Fetch one page of the listing and return its HTML.

    Args:
        url: Base URL; the page number is appended as a ``page`` query
            parameter.
        page: Page number to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely.

    Returns:
        The response body as text, or ``None`` if the request failed or the
        server answered with a status code other than 200.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    try:
        response = requests.get(f"{url}?page={page}", headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Timeouts and connection errors follow the same "report and return
        # None" path as a non-200 status, so the caller can stop cleanly.
        print(f"Failed to fetch page {page}: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch page {page}: Status code {response.status_code}")
        return None
    return response.text

# Function to extract market data from a page
def extract_markets(html):
    """Return the first HTML table found in *html* as a DataFrame.

    Args:
        html: HTML document as a string.

    Returns:
        The first table parsed by ``pandas.read_html``, or an empty
        DataFrame when the document contains no table.
    """
    # Local import keeps this function self-contained; the script's
    # top-level imports do not include io.
    from io import StringIO

    try:
        # Wrap the markup in StringIO: passing literal HTML strings to
        # read_html is deprecated in recent pandas releases.
        tables = pd.read_html(StringIO(html))
    except ValueError:
        # read_html raises ValueError ("No tables found") rather than
        # returning an empty list, so translate that into an empty frame.
        return pd.DataFrame()
    return tables[0] if tables else pd.DataFrame()

# Main function to scrape all pages
def scrape_all_pages(base_url, max_pages=10):
    """Collect the market table from consecutive pages into one DataFrame.

    Pages are requested from 1 up to ``max_pages``. Scraping stops early as
    soon as a page cannot be fetched or yields an empty table.

    Args:
        base_url: URL passed to ``fetch_page`` for every page.
        max_pages: Highest page number to request.

    Returns:
        All collected tables concatenated with a fresh index, or an empty
        DataFrame when nothing was collected.
    """
    frames = []
    page_no = 1
    while page_no <= max_pages:
        print(f"Scraping page {page_no}...")
        page_html = fetch_page(base_url, page_no)
        if page_html is None:
            break
        table = extract_markets(page_html)
        if table.empty:
            break
        frames.append(table)
        page_no += 1

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Run the scraper; raise max_pages to request more pages.
max_pages = 10
df = scrape_all_pages(base_url, max_pages=max_pages)

# Show the combined result.
print(df)

Output (error):

Scraping page 1...
Failed to fetch page 1: Status code 403
Empty DataFrame
Columns: []
Index: []

I also tried a solution suggested on Stack Overflow, but it did not resolve the issue.

Could someone suggest a workaround or a more effective way to scrape this data? Any help would be greatly appreciated. Thank you in advance.


Solution

  • As suggested in response to your other question, your requests are being identified as bot traffic, which is why you get the 403 error. Use Playwright to access the site through a real browser instead.

    import time
    from io import StringIO
    
    from playwright.sync_api import sync_playwright
    from bs4 import BeautifulSoup
    import pandas as pd
    
    # 🚨 Specific URL for getting Bitcoin markets.
    #
    # Plain string: there are no placeholders, so no f-string prefix is needed.
    URL = "https://www.coingecko.com/en/coins/1/markets/spot"

    # Start Playwright without a context manager and open a visible
    # (non-headless) Chromium window; slow_mo=2000 is passed to slow the
    # browser's operations down (value in milliseconds per Playwright docs).
    playwright = sync_playwright().start()
    browser = playwright.chromium.launch(headless=False, slow_mo=2000)
    context = browser.new_context(
        viewport={"width": 1280, "height": 900}
    )

    # Single page (tab) that fetch_page() reuses for every URL.
    page = context.new_page()
    
    def fetch_page(url):
        """Navigate the shared browser page to *url* and return its HTML.

        Prints the URL, loads it in the module-level ``page``, waits five
        seconds, and then returns the page's current content.
        """
        print(url)
        page.goto(url)
        # Fixed pause before reading the content.
        time.sleep(5)

        rendered_html = page.content()
        return rendered_html
    
    def scrape_all_pages(url, max_pages=10):
        """Scrape every table from pages 1..max_pages into one DataFrame.

        Args:
            url: Base URL; ``?page=<n>`` is appended for each page.
            max_pages: Highest page number to request.

        Returns:
            All tables found, concatenated with a fresh index, or an empty
            DataFrame when no page contained a table.
        """
        markets = []
        # Named page_num so it does not shadow the module-level Playwright
        # ``page`` object that fetch_page() relies on.
        for page_num in range(1, max_pages + 1):
            html = fetch_page(f"{url}?page={page_num}")

            try:
                tables = pd.read_html(StringIO(html))
            except ValueError as e:
                # read_html raises ValueError when a page has no tables;
                # skip that page instead of aborting the whole run.
                print(f"No tables found on page {page_num}: {e}")
                continue
            markets.extend(tables)

        # pd.concat raises ValueError on an empty list, so handle it here.
        if not markets:
            return pd.DataFrame()
        return pd.concat(markets, ignore_index=True)
    
    max_pages = 10
    try:
        df = scrape_all_pages(URL, max_pages)
    finally:
        # Release every browser resource even if scraping fails: closing only
        # the page would leave the context, the browser process and the
        # Playwright driver (started via .start()) running.
        page.close()
        context.close()
        browser.close()
        playwright.stop()

    # Drop rows in which every column is NaN.
    df = df.dropna(how='all')

    print(df)
    

    Top of output:

           #             Exchange Unnamed: 2       Pair       Price Spread    +2% Depth    -2% Depth      24h Volume Volume % Last Updated  Trust Score
    1    1.0              Binance        CEX   BTC/USDT  $61,578.60  0.01%  $14,564,938  $19,766,330  $1,226,281,740    5.72%     Recently          NaN
    2    2.0    Coinbase Exchange        CEX    BTC/USD  $61,570.63  0.01%  $15,912,548  $14,809,605    $667,341,947    3.12%     Recently          NaN
    3    3.0               Kraken        CEX    BTC/USD  $61,584.00  0.01%  $13,621,680  $13,100,698     $50,592,315    0.24%     Recently          NaN
    4    4.0              Gate.io        CEX   BTC/USDT  $61,584.36  0.01%  $12,523,800  $11,856,866    $202,923,100    0.95%     Recently          NaN
    5    5.0              Binance        CEX  BTC/FDUSD  $61,568.21  0.01%   $8,355,196   $8,489,839  $1,901,656,552    8.88%     Recently          NaN
    6    6.0                  OKX        CEX   BTC/USDT  $61,588.33  0.01%   $4,552,443  $13,952,016    $398,284,635    1.86%     Recently          NaN
    7    7.0               Bitget        CEX   BTC/USDT  $61,580.27  0.01%   $8,598,359   $8,848,635    $239,239,284    1.12%     Recently          NaN
    8    8.0               Kraken        CEX    BTC/EUR  $61,589.72  0.01%   $7,703,734   $7,050,064     $27,293,519    0.13%     Recently          NaN
    9    9.0                Bybit        CEX   BTC/USDT  $61,583.98  0.01%   $2,208,077   $1,347,924  $1,103,150,476    5.15%     Recently          NaN
    10  10.0               Pionex        CEX   BTC/USDT  $61,588.25  0.01%  $17,409,820  $15,637,094    $224,747,215    1.05%     Recently          NaN
    12  11.0              Binance        CEX   WBTC/BTC  $61,700.03  0.02%   $5,446,446  $20,784,820      $8,023,065    0.04%     Recently          NaN
    13  12.0  Crypto.com Exchange        CEX    BTC/USD  $61,588.55  0.01%   $2,539,978   $5,433,264    $359,030,532    1.68%     Recently          NaN
    14  13.0  Crypto.com Exchange        CEX   BTC/USDT  $61,581.55  0.01%   $2,635,507   $5,917,342    $267,921,739    1.25%     Recently          NaN
    15  14.0              Binance        CEX    ETH/BTC   $3,449.52  0.02%   $7,954,561   $9,115,420     $73,631,305    0.34%     Recently          NaN
    16  15.0                LBank        CEX   BTC/USDT  $61,588.16  0.01%  $12,152,948  $12,658,338    $401,602,782    1.87%     Recently          NaN
    17  16.0                 MEXC        CEX   BTC/USDT  $61,578.24  0.01%   $1,693,904   $2,007,183    $477,125,505    2.23%     Recently          NaN
    18  17.0           CoinTR Pro        CEX   BTC/USDT  $61,578.39  0.01%   $9,294,010   $3,947,978    $158,178,596    0.74%     Recently          NaN
    19  18.0             Bitfinex        CEX   BTC/USDT  $61,586.63  0.02%   $5,122,254   $5,538,887      $7,127,016    0.03%     Recently          NaN
    20  19.0              Binance        CEX    BNB/BTC     $581.02  0.01%   $1,040,939   $5,591,340     $13,706,709    0.06%     Recently          NaN
    21  20.0                Dcoin        CEX   BTC/USDT  $61,583.99  0.02%   $5,939,915   $5,770,785     $20,137,964    0.09%     Recently          NaN
    

    Here's an asynchronous implementation too.

    import asyncio
    from playwright.async_api import async_playwright
    from bs4 import BeautifulSoup
    import pandas as pd
    from io import StringIO
    
    # Markets page for Bitcoin; a plain string because there are no
    # placeholders to interpolate.
    URL = "https://www.coingecko.com/en/coins/1/markets/spot"
    
    async def fetch_page(page, url):
        """Load *url* in the given Playwright page and return its HTML.

        Prints the URL being fetched, navigates to it, waits five seconds
        without blocking the event loop, and returns the page content.
        """
        print(f"Fetching: {url}")
        await page.goto(url)
        # Fixed pause before reading the content.
        await asyncio.sleep(5)
        rendered_html = await page.content()
        return rendered_html
    
    async def scrape_all_pages(url, max_pages=10):
        """Scrape every table from pages 1..max_pages into one DataFrame.

        Launches a visible Chromium window, visits ``url?page=<n>`` for each
        page number, and parses all HTML tables on each page.

        Args:
            url: Base URL; ``?page=<n>`` is appended for each page.
            max_pages: Highest page number to request.

        Returns:
            All tables found, concatenated with a fresh index, or an empty
            DataFrame when no page contained a table.
        """
        markets = []
        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(headless=False, slow_mo=2000)
            try:
                context = await browser.new_context(viewport={"width": 1280, "height": 900})
                page = await context.new_page()

                for page_num in range(1, max_pages + 1):
                    full_url = f"{url}?page={page_num}"
                    html = await fetch_page(page, full_url)

                    try:
                        dfs = pd.read_html(StringIO(html))
                    except ValueError as e:
                        # read_html raises ValueError when a page has no tables.
                        print(f"No tables found on page {page_num}: {e}")
                    else:
                        markets.extend(dfs)

                await page.close()
            finally:
                # Close the browser even if navigation or parsing fails.
                await browser.close()

        # pd.concat raises ValueError on an empty list, which would happen if
        # every page lacked tables; return an empty frame instead.
        if not markets:
            return pd.DataFrame()
        return pd.concat(markets, ignore_index=True)
    
    async def main():
        """Scrape ten pages of markets, drop all-NaN rows, and print the result."""
        page_limit = 10
        markets = await scrape_all_pages(URL, page_limit)

        # Remove rows in which every column is NaN before printing.
        cleaned = markets.dropna(how="all")
        print(cleaned)
    
    # Run the scraper only when executed as a script, not when imported.
    if __name__ == "__main__":
        # asyncio.run() creates an event loop and runs main() to completion.
        asyncio.run(main())