I'm trying to get the data from a simple table on the following website (https://bvmf.bmfbovespa.com.br/clube-de-investimento/clube-de-investimento.aspx?Idioma=pt-br). I was able to get the data from the first page, but as we can see, the pagination is not reflected in the URL, so I couldn't reach the other pages. I did find the buttons at the bottom of the page ("ProximoPaginacao" and "MeioPaginacao"), but I couldn't figure out how to work with them. Any ideas?
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_table_data(url, table_id):
    try:
        response = requests.get(url, verify=False)
        response.raise_for_status()
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        table = soup.find('table', id=table_id)
        if not table:
            print(f"Table with ID '{table_id}' not found.")
            return None

        # Extract header row
        header_row = [th.get_text(strip=True) for th in table.find_all('th')]

        # Extract data rows
        data_rows = []
        for row in table.find('tbody').find_all('tr'):
            data_rows.append([td.get_text(strip=True) for td in row.find_all('td')])

        # Create DataFrame
        df = pd.DataFrame(data_rows, columns=header_row)
        return df
    except requests.exceptions.RequestException as e:
        print(f"Error during requests: {e}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Example usage
url = "https://bvmf.bmfbovespa.com.br/clube-de-investimento/clube-de-investimento.aspx?Idioma=pt-br"  # Replace with the actual URL
table_id = "ctl00_contentPlaceHolderConteudo_grdAtivo_ctl01"  # Replace with the actual table ID

table_data = extract_table_data(url, table_id)
if table_data is not None:
    print(table_data)
You can get all the pages using requests & BeautifulSoup, without Selenium. The pager buttons trigger an ASP.NET postback: the form is submitted with the page's hidden __VIEWSTATE value plus an __EVENTTARGET that names the clicked button. You can replicate that postback with requests.post:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_all_tables(url):
    # The site's TLS certificate fails verification, so verify=False is needed;
    # silence the resulting InsecureRequestWarning
    requests.packages.urllib3.disable_warnings()
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36',
    }
    data = {}
    tables = []
    while True:
        print(f'Scraping table #{len(tables) + 1}')
        # First iteration posts an empty payload (equivalent to the initial page load);
        # later iterations post the __VIEWSTATE/__EVENTTARGET pair set below
        response = requests.post(url, headers=headers, data=data, verify=False)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        table = soup.select_one('table#ctl00_contentPlaceHolderConteudo_grdAtivo_ctl01')
        header_row = [th.get_text(strip=True) for th in table.thead.select('th')]
        data_rows = [[td.get_text(strip=True) for td in tr.select('td')] for tr in table.tbody.select('tr')]
        df = pd.DataFrame(data_rows, columns=header_row)
        tables.append(df)

        # The "next page" button sits in the table footer; it disappears on the last page
        next_button = table.tfoot.select_one('td.ProximoPaginacao > input')
        if not next_button:
            break

        # Prepare the postback that requests the next page
        data['__VIEWSTATE'] = soup.select_one('input#__VIEWSTATE').get('value')
        data['__EVENTTARGET'] = next_button.get('name').replace('$', ':')
    return tables

url = 'https://bvmf.bmfbovespa.com.br/clube-de-investimento/clube-de-investimento.aspx?idioma=pt-br'
tables = extract_all_tables(url)
print(f'{len(tables) = }')
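If you want a single DataFrame rather than a list of per-page frames, you can stack them afterwards. A minimal sketch, assuming every page has the same columns (which holds here, since each page renders the same GridView); the CSV filename is just an example:

import pandas as pd

# Stack all per-page frames into one table, renumbering the index
df = pd.concat(tables, ignore_index=True)
print(df.shape)
df.to_csv('clubes_de_investimento.csv', index=False)

One caveat for reuse on other ASP.NET WebForms pages: some of them also validate the __EVENTVALIDATION and __VIEWSTATEGENERATOR hidden fields, so if a similar postback gets rejected elsewhere, try copying those inputs into data the same way as __VIEWSTATE.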