I am trying to scrape the Bitcoin markets section from CoinGecko with Python, but I keep getting an HTTP 403 error. I have tried the requests library with custom headers to mimic a real browser, but the error persists.
Here is the code I am using:
import requests
import pandas as pd

# Base URL for Bitcoin markets on CoinGecko
base_url = "https://www.coingecko.com/en/coins/bitcoin"

# Function to fetch a single page
def fetch_page(url, page):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    response = requests.get(f"{url}?page={page}", headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch page {page}: Status code {response.status_code}")
        return None
    return response.text

# Function to extract market data from a page
def extract_markets(html):
    dfs = pd.read_html(html)
    return dfs[0] if dfs else pd.DataFrame()

# Main function to scrape all pages
def scrape_all_pages(base_url, max_pages=10):
    all_markets = []
    for page in range(1, max_pages + 1):
        print(f"Scraping page {page}...")
        html = fetch_page(base_url, page)
        if html is None:
            break
        df = extract_markets(html)
        if df.empty:
            break
        all_markets.append(df)
    return pd.concat(all_markets, ignore_index=True) if all_markets else pd.DataFrame()

# Scrape data and store in a DataFrame
max_pages = 10  # Adjust this to scrape more pages if needed
df = scrape_all_pages(base_url, max_pages)

# Display the DataFrame
print(df)
Error output:
Scraping page 1...
Failed to fetch page 1: Status code 403
Empty DataFrame
Columns: []
Index: []
I also tried a solution suggested on Stack Overflow, but it did not resolve the issue.
Could someone suggest a workaround or a more effective way to scrape this data? Any help would be greatly appreciated. Thank you in advance.
As suggested in response to your other question, your requests are being identified as bot traffic, hence the 403. Use Playwright to access the site through a real browser instead.
import time
from io import StringIO

import pandas as pd
from playwright.sync_api import sync_playwright

# 🚨 Specific URL for getting Bitcoin markets.
URL = "https://www.coingecko.com/en/coins/1/markets/spot"

playwright = sync_playwright().start()
browser = playwright.chromium.launch(headless=False, slow_mo=2000)
context = browser.new_context(
    viewport={"width": 1280, "height": 900}
)
page = context.new_page()

def fetch_page(url):
    print(url)
    page.goto(url)
    time.sleep(5)  # give the page time to finish rendering
    return page.content()

def scrape_all_pages(url, max_pages=10):
    markets = []
    for page_num in range(1, max_pages + 1):
        html = fetch_page(f"{url}?page={page_num}")
        dfs = pd.read_html(StringIO(html))
        markets.extend(dfs)
    return pd.concat(markets, ignore_index=True)

max_pages = 10
df = scrape_all_pages(URL, max_pages)

page.close()
browser.close()
playwright.stop()

df = df.dropna(how='all')
print(df)
Top of output:
# Exchange Unnamed: 2 Pair Price Spread +2% Depth -2% Depth 24h Volume Volume % Last Updated Trust Score
1 1.0 Binance CEX BTC/USDT $61,578.60 0.01% $14,564,938 $19,766,330 $1,226,281,740 5.72% Recently NaN
2 2.0 Coinbase Exchange CEX BTC/USD $61,570.63 0.01% $15,912,548 $14,809,605 $667,341,947 3.12% Recently NaN
3 3.0 Kraken CEX BTC/USD $61,584.00 0.01% $13,621,680 $13,100,698 $50,592,315 0.24% Recently NaN
4 4.0 Gate.io CEX BTC/USDT $61,584.36 0.01% $12,523,800 $11,856,866 $202,923,100 0.95% Recently NaN
5 5.0 Binance CEX BTC/FDUSD $61,568.21 0.01% $8,355,196 $8,489,839 $1,901,656,552 8.88% Recently NaN
6 6.0 OKX CEX BTC/USDT $61,588.33 0.01% $4,552,443 $13,952,016 $398,284,635 1.86% Recently NaN
7 7.0 Bitget CEX BTC/USDT $61,580.27 0.01% $8,598,359 $8,848,635 $239,239,284 1.12% Recently NaN
8 8.0 Kraken CEX BTC/EUR $61,589.72 0.01% $7,703,734 $7,050,064 $27,293,519 0.13% Recently NaN
9 9.0 Bybit CEX BTC/USDT $61,583.98 0.01% $2,208,077 $1,347,924 $1,103,150,476 5.15% Recently NaN
10 10.0 Pionex CEX BTC/USDT $61,588.25 0.01% $17,409,820 $15,637,094 $224,747,215 1.05% Recently NaN
12 11.0 Binance CEX WBTC/BTC $61,700.03 0.02% $5,446,446 $20,784,820 $8,023,065 0.04% Recently NaN
13 12.0 Crypto.com Exchange CEX BTC/USD $61,588.55 0.01% $2,539,978 $5,433,264 $359,030,532 1.68% Recently NaN
14 13.0 Crypto.com Exchange CEX BTC/USDT $61,581.55 0.01% $2,635,507 $5,917,342 $267,921,739 1.25% Recently NaN
15 14.0 Binance CEX ETH/BTC $3,449.52 0.02% $7,954,561 $9,115,420 $73,631,305 0.34% Recently NaN
16 15.0 LBank CEX BTC/USDT $61,588.16 0.01% $12,152,948 $12,658,338 $401,602,782 1.87% Recently NaN
17 16.0 MEXC CEX BTC/USDT $61,578.24 0.01% $1,693,904 $2,007,183 $477,125,505 2.23% Recently NaN
18 17.0 CoinTR Pro CEX BTC/USDT $61,578.39 0.01% $9,294,010 $3,947,978 $158,178,596 0.74% Recently NaN
19 18.0 Bitfinex CEX BTC/USDT $61,586.63 0.02% $5,122,254 $5,538,887 $7,127,016 0.03% Recently NaN
20 19.0 Binance CEX BNB/BTC $581.02 0.01% $1,040,939 $5,591,340 $13,706,709 0.06% Recently NaN
21 20.0 Dcoin CEX BTC/USDT $61,583.99 0.02% $5,939,915 $5,770,785 $20,137,964 0.09% Recently NaN
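Note that the numeric columns (Price, Spread, the depth columns, 24h Volume, Volume %) come back as strings containing "$", "," and "%", so coerce them if you want to do any arithmetic. A minimal post-processing sketch, assuming the column names shown in the output above:

# Strip "$", "," and "%" from the numeric-looking columns and convert to floats.
# The column names below are taken from the output above; adjust them if
# CoinGecko changes the table headers.
num_cols = ["Price", "Spread", "+2% Depth", "-2% Depth", "24h Volume", "Volume %"]
for col in num_cols:
    df[col] = pd.to_numeric(
        df[col].astype(str).str.replace(r"[$,%]", "", regex=True),
        errors="coerce",
    )
print(df.dtypes)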
Here's an asynchronous implementation too.
import asyncio
from io import StringIO

import pandas as pd
from playwright.async_api import async_playwright

URL = "https://www.coingecko.com/en/coins/1/markets/spot"

async def fetch_page(page, url):
    print(f"Fetching: {url}")
    await page.goto(url)
    await asyncio.sleep(5)  # give the page time to finish rendering
    return await page.content()

async def scrape_all_pages(url, max_pages=10):
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=False, slow_mo=2000)
        context = await browser.new_context(viewport={"width": 1280, "height": 900})
        page = await context.new_page()
        markets = []
        for page_num in range(1, max_pages + 1):
            full_url = f"{url}?page={page_num}"
            html = await fetch_page(page, full_url)
            try:
                dfs = pd.read_html(StringIO(html))
                markets.extend(dfs)
            except ValueError as e:
                print(f"No tables found on page {page_num}: {e}")
        await page.close()
        await browser.close()
    return pd.concat(markets, ignore_index=True) if markets else pd.DataFrame()

async def main():
    max_pages = 10
    df = await scrape_all_pages(URL, max_pages)
    df = df.dropna(how="all")
    print(df)

if __name__ == "__main__":
    asyncio.run(main())
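Two optional tweaks if you run this unattended: launch with headless=True so no browser window opens, and swap the fixed five-second sleep for an explicit wait on the markets table. Here is a sketch of the adjusted async fetch; the "table" selector is my assumption about the page markup, so check it in your browser's dev tools before relying on it:

# Hypothetical variant of fetch_page that waits for the table instead of sleeping.
# "table" is an assumed selector; inspect the page and narrow it down if needed.
async def fetch_page(page, url):
    print(f"Fetching: {url}")
    await page.goto(url)
    await page.wait_for_selector("table", timeout=30_000)
    return await page.content()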