python, pandas, web-scraping, beautifulsoup, python-requests

Scraping dynamic data table with no easy references


I'm trying to get the data from a simple table on the following website (https://bvmf.bmfbovespa.com.br/clube-de-investimento/clube-de-investimento.aspx?Idioma=pt-br). I was able to get the data from the first page, but the pagination is not reflected in the URL, so I couldn't reach the other pages. I did find the buttons at the bottom of the page ("ProximoPaginacao" and "MeioPaginacao"), but I couldn't work out how to use them. Any ideas?

import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_table_data(url, table_id):
    try:
        response = requests.get(url, verify=False)
        response.raise_for_status()
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')
        table = soup.find('table', id=table_id)
        if not table:
            print(f"Table with ID '{table_id}' not found.")
            return None

        # Extract header row
        header_row = [th.get_text(strip=True) for th in table.find_all('th')]

        # Extract data rows
        data_rows = []
        for row in table.find('tbody').find_all('tr'):
            data_rows.append([td.get_text(strip=True) for td in row.find_all('td')])

        # Create DataFrame
        df = pd.DataFrame(data_rows, columns=header_row)
        return df
    except requests.exceptions.RequestException as e:
        print(f"Error during requests: {e}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Example usage
url = "https://bvmf.bmfbovespa.com.br/clube-de-investimento/clube-de-investimento.aspx? 
Idioma=pt-br"  # Replace with the actual URL
table_id = "ctl00_contentPlaceHolderConteudo_grdAtivo_ctl01"  # Replace with the actual 
table ID
table_data = extract_table_data(url, table_id)

if table_data is not None:
    print(table_data)

Solution

  • You can get all the pages using requests & BeautifulSoup, without Selenium. The page is an ASP.NET WebForms app: each pagination button triggers a form postback to the same URL, carrying the hidden __VIEWSTATE field plus an __EVENTTARGET that names the clicked button. Replaying that POST in a loop walks through every page:

    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    
    
    def extract_all_tables(url):
        # Silence the InsecureRequestWarning triggered by verify=False below
        requests.packages.urllib3.disable_warnings()
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36',
        }
        # Empty payload on the first request returns page 1; afterwards it
        # carries the postback fields that make the server render the next page
        data = {}
    
        tables = []
        while True:
            print(f'Scraping table #{len(tables) + 1}')
            response = requests.post(url, headers=headers, data=data, verify=False)
            response.raise_for_status()
    
            soup = BeautifulSoup(response.text, 'html.parser')
            table = soup.select_one('table#ctl00_contentPlaceHolderConteudo_grdAtivo_ctl01')
            header_row = [th.get_text(strip=True) for th in table.thead.select('th')]
            data_rows = [[td.get_text(strip=True) for td in tr.select('td')] for tr in table.tbody.select('tr')]
            
            df = pd.DataFrame(data_rows, columns=header_row)
            tables.append(df)
    
            # The "next page" button sits in the table footer and disappears on the last page
            next_button = table.tfoot.select_one('td.ProximoPaginacao > input')
            if not next_button:
                break
    
            # Replay the ASP.NET postback: send the current __VIEWSTATE back and
            # name the clicked button as the event target (':' separators instead of '$')
            data['__VIEWSTATE'] = soup.select_one('input#__VIEWSTATE').get('value')
            data['__EVENTTARGET'] = next_button.get('name').replace('$', ':')
    
        return tables
    
    
    url = 'https://bvmf.bmfbovespa.com.br/clube-de-investimento/clube-de-investimento.aspx?idioma=pt-br'
    tables = extract_all_tables(url)
    print(f'{len(tables) = }')
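
  • If you want a single DataFrame instead of one per page, you can concatenate them afterwards. A minimal sketch, assuming every page shares the same header row (the CSV filename is just an example):

    df_all = pd.concat(tables, ignore_index=True)  # stack the per-page frames
    df_all.to_csv('clubes_de_investimento.csv', index=False)  # optional: persist the result
    print(df_all.shape)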