python selenium-webdriver web-scraping web-crawler

Scraping/crawling a website with multiple tabs using Python


I am seeking assistance in extracting data from a website with multiple tabs and saving it in a .csv format using Python and Selenium. The website in question is: https://www.amfiindia.com/research-information/other-data/mf-scheme-performance-details.

There are five different tabs on the page, but my focus is on extracting data from the first three tabs.

1st Tab: 2 options

2nd Tab: 5 options

3rd Tab: multiple options

Additionally, there are two more tabs, one representing "ALL" and the other representing the date. I need to retrieve data for all combinations of the first three tabs while keeping the "ALL" tab selected and the date set to the current date.
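
For example, the combinations could be enumerated like this (the option values are placeholders for whatever the dropdowns actually contain; the site appears to use a dd-Mon-yyyy date format):

from datetime import date
from itertools import product

# Placeholder option lists; the real values come from the page's dropdowns
tab1_options = ["1", "2"]
tab2_options = ["1", "2", "3", "4", "5"]
tab3_options = ["1", "2", "3"]

nav_date = date.today().strftime("%d-%b-%Y")  # e.g. "25-Oct-2024"

for tab1, tab2, tab3 in product(tab1_options, tab2_options, tab3_options):
    print(tab1, tab2, tab3, "ALL", nav_date)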

I tried to do this with Selenium, but due to my limited experience with the tool I could not get it working, so I am looking for guidance on how to proceed. Here is my attempt:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from io import StringIO
import time
import random

def wait_for_element(driver, by, value, timeout=10):
    return WebDriverWait(driver, timeout).until(EC.presence_of_element_located((by, value)))

def scrape_and_save(driver, end_type, equity_type, cap_type, all_type, filename):
    # Select options from dropdowns (the element IDs below are guesses; inspect the page for the real ones)
    Select(wait_for_element(driver, By.ID, "end-type")).select_by_value(end_type)
    time.sleep(random.uniform(1, 2))
    Select(wait_for_element(driver, By.ID, "equity-type")).select_by_value(equity_type)
    time.sleep(random.uniform(1, 2))
    Select(wait_for_element(driver, By.ID, "cap-type")).select_by_value(cap_type)
    time.sleep(random.uniform(1, 2))
    Select(wait_for_element(driver, By.ID, "all-type")).select_by_value(all_type)
    time.sleep(random.uniform(1, 2))
    
    # Click "Go" button
    wait_for_element(driver, By.ID, "go-button").click()
    
    # Wait for table to load
    table = wait_for_element(driver, By.ID, "fund-table", timeout=15)
    
    # Extract table data (StringIO wrapper needed for newer pandas versions)
    df = pd.read_html(StringIO(table.get_attribute('outerHTML')))[0]
    
    # Save to CSV
    df.to_csv(filename, index=False)
    print(f"Saved data to {filename}")

# Set up Selenium WebDriver
driver = webdriver.Chrome()  # Make sure you have chromedriver installed and in PATH
driver.get("https://www.amfiindia.com/research-information/other-data/mf-scheme-performance-details")  # Replace with actual URL

# Wait for initial page load
wait_for_element(driver, By.ID, "end-type", timeout=30)
print("Page loaded successfully")

# Define options for each dropdown
end_types = ["1", "2"]  # Open-ended, Closed-end
equity_types = ["1", "2", "3", "4", "5", "6"]  # Replace with actual values
cap_types = ["1", "2", "3", "4"]  # Replace with actual values
all_types = ["1", "2", "3", "4"]  # Replace with actual values

# Iterate through combinations
for end in end_types:
    for equity in equity_types:
        for cap in cap_types:
            for all_type in all_types:
                filename = f"fund_data_{end}_{equity}_{cap}_{all_type}.csv"
                try:
                    scrape_and_save(driver, end, equity, cap, all_type, filename)
                    time.sleep(random.uniform(3, 5))  # Random wait between 3 to 5 seconds
                except Exception as e:
                    print(f"Error scraping combination {end}_{equity}_{cap}_{all_type}: {str(e)}")

driver.quit()

Solution

  • Your target page loads the table from https://www.valueresearchonline.com/amfi via an iframe (you can verify this with the quick check below), so we can extract the data directly from that iframe page with bs4, which is much faster than Selenium in this case. Here is the sample code:
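
    You can confirm the iframe source yourself before scraping. Below is a minimal sketch, assuming the <iframe> tag is present in the page's initial HTML (if it is injected by JavaScript, you would need Selenium to see it):

    import requests
    from bs4 import BeautifulSoup

    url = "https://www.amfiindia.com/research-information/other-data/mf-scheme-performance-details"
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text
    soup = BeautifulSoup(resp, 'lxml')

    # Print the src of every iframe on the page; one should point to valueresearchonline.com
    for frame in soup.find_all('iframe'):
        print(frame.get('src'))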

    Code:

    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    from datetime import date
    
    def fetchValue(primary_category, category, file):
        fund_name = []
        fund_benchmark = []
        riskometer_scheme = []
        riskometer_benchmark = []
        latest_nav_regular = []
        latest_nav_direct = []
        five_year_return_regular = []
        five_year_return_direct = []
        five_year_return_benchmark = []
        daily_aum_cr = []
    
        # Keep the date filter on the current date, in the site's dd-Mon-yyyy format
        nav_date = date.today().strftime('%d-%b-%Y')
        url = f'https://www.valueresearchonline.com/amfi/fund-performance-data/?end-type=1&primary-category={primary_category}&category={category}&amc=ALL&nav-date={nav_date}'
        resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0", "Referer": "https://www.valueresearchonline.com/amfi/fund-performance"}).text
        soup = BeautifulSoup(resp, 'lxml')
        rows = soup.find_all('tr')
        for row in rows:
            try:
                content = row.find_all('td')
                fund_name.append(content[0].text.strip())
                fund_benchmark.append(content[1].text)
                riskometer_scheme.append(content[2].text)
                riskometer_benchmark.append(content[3].text)
                latest_nav_regular.append(content[4].text.strip())
                latest_nav_direct.append(content[5].text.strip())
                five_year_return_regular.append(content[6].text.strip())
                five_year_return_direct.append(content[7].text.strip())
                five_year_return_benchmark.append(content[8].text.strip())  # daily AUM sits at index 10; index 9 is skipped
                daily_aum_cr.append(content[10].text.strip())
            except Exception:
                # Header/section rows don't have the full set of <td> cells, so skip them
                pass
        data = {
            "Scheme": fund_name,
            "Benchmark": fund_benchmark,
            "Riskometer_Scheme": riskometer_scheme,
            "Riskometer_Benchmark": riskometer_benchmark,
            "Latest_Nav_Regular": latest_nav_regular,
            "Latest_Nav_Direct": latest_nav_direct,
            "Five_Year_Return_Regular": five_year_return_regular,
            "Five_Year_Return_Direct": five_year_return_direct,
            "Five_Year_Return_Benchmark": five_year_return_benchmark,
            "Daily_AUM": daily_aum_cr
        }

        df = pd.DataFrame(data)
        df.to_csv(file, index=False)
    
    url = "https://www.valueresearchonline.com/amfi/fund-performance"
    resp = requests.get(url, headers={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0","Referer":"https://www.amfiindia.com/"}).text
    soup = BeautifulSoup(resp, 'lxml')
    
    category_list = soup.find('select', id='category')
    # The category dropdown exposes 40 category combinations across tabs 2-3
    for option in category_list.find_all('option')[:40]:
        category = option['value']
        primary_category = category.split('_')[0]
        fetchValue(primary_category, category, f'{category}.csv')
    

    I kept the code as basic as possible for easier understanding.
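
    As a side note, the query string built in fetchValue can also be assembled with requests' params argument, and since the endpoint returns a plain HTML table, pandas can parse it in one step. This is an untested sketch (the category values below are hypothetical placeholders; real ones come from the category dropdown):

    import requests
    import pandas as pd
    from io import StringIO

    params = {
        "end-type": "1",
        "primary-category": "PC1",   # hypothetical placeholder
        "category": "PC1_CAT1",      # hypothetical placeholder
        "amc": "ALL",
        "nav-date": "25-Oct-2024",
    }
    headers = {"User-Agent": "Mozilla/5.0", "Referer": "https://www.valueresearchonline.com/amfi/fund-performance"}
    resp = requests.get("https://www.valueresearchonline.com/amfi/fund-performance-data/", params=params, headers=headers).text
    df = pd.read_html(StringIO(resp))[0]  # first table in the response
    df.to_csv("fund_data.csv", index=False)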