I am seeking assistance in extracting data from a website with multiple tabs and saving it in a .csv format using Python and Selenium. The website in question is: https://www.amfiindia.com/research-information/other-data/mf-scheme-performance-details.
There are five tabs on the page; my focus is on extracting data from the first three. The remaining two are an "ALL" selector and a date selector. I need to retrieve data for every combination of the first three tabs while keeping "ALL" selected and the date set to the current date.
I was attempting to perform this operation using Selenium, but due to my limited experience with the tool, I was unable to achieve the desired outcome. Therefore, I am seeking guidance on how to proceed.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import random

def wait_for_element(driver, by, value, timeout=10):
    return WebDriverWait(driver, timeout).until(EC.presence_of_element_located((by, value)))

def scrape_and_save(driver, end_type, equity_type, cap_type, all_type, filename):
    # Select options from the dropdowns
    Select(wait_for_element(driver, By.ID, "end-type")).select_by_value(end_type)
    time.sleep(random.uniform(1, 2))
    Select(wait_for_element(driver, By.ID, "equity-type")).select_by_value(equity_type)
    time.sleep(random.uniform(1, 2))
    Select(wait_for_element(driver, By.ID, "cap-type")).select_by_value(cap_type)
    time.sleep(random.uniform(1, 2))
    Select(wait_for_element(driver, By.ID, "all-type")).select_by_value(all_type)
    time.sleep(random.uniform(1, 2))

    # Click the "Go" button
    wait_for_element(driver, By.ID, "go-button").click()

    # Wait for the results table to load
    table = wait_for_element(driver, By.ID, "fund-table", timeout=15)

    # Extract the table data and save it to CSV
    df = pd.read_html(table.get_attribute("outerHTML"))[0]
    df.to_csv(filename, index=False)
    print(f"Saved data to {filename}")

# Set up the Selenium WebDriver (chromedriver must be installed and on PATH)
driver = webdriver.Chrome()
driver.get("https://www.amfiindia.com/research-information/other-data/mf-scheme-performance-details")

# Wait for the initial page load
wait_for_element(driver, By.ID, "end-type", timeout=30)
print("Page loaded successfully")

# Define options for each dropdown (placeholder values -- replace with actual values)
end_types = ["1", "2"]  # Open-ended, Closed-end
equity_types = ["1", "2", "3", "4", "5", "6"]
cap_types = ["1", "2", "3", "4"]
all_types = ["1", "2", "3", "4"]

# Iterate through all combinations
for end in end_types:
    for equity in equity_types:
        for cap in cap_types:
            for all_type in all_types:
                filename = f"fund_data_{end}_{equity}_{cap}_{all_type}.csv"
                try:
                    scrape_and_save(driver, end, equity, cap, all_type, filename)
                    time.sleep(random.uniform(3, 5))  # Random pause between combinations
                except Exception as e:
                    print(f"Error scraping combination {end}_{equity}_{cap}_{all_type}: {str(e)}")

driver.quit()
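In case it helps, I also wondered whether the dropdown option values could be read at runtime instead of being hardcoded, since the IDs and values in my attempt (end-type, equity-type, etc.) are only placeholders. A rough sketch of what I mean, reusing the wait_for_element helper above and Selenium's Select.options:

# Sketch: collect the available option values from a <select> instead of hardcoding them.
# NOTE: "end-type" is a placeholder ID from my attempt, not a verified ID on the real page.
def get_option_values(driver, select_id):
    select_el = Select(wait_for_element(driver, By.ID, select_id))
    return [opt.get_attribute("value") for opt in select_el.options]

# e.g. end_types = get_option_values(driver, "end-type")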
Your target page loads that table from https://www.valueresearchonline.com/amfi via an iframe, so you can pull the data directly from the iframe's pages with requests and BeautifulSoup (bs4) instead of driving the site with Selenium.
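If you want to confirm the iframe target yourself, you can fetch the AMFI page and print the src of every iframe it contains (a minimal sketch; the exact markup may differ):

import requests
from bs4 import BeautifulSoup

# List every iframe src on the AMFI page to see where the table is actually served from
page = requests.get(
    "https://www.amfiindia.com/research-information/other-data/mf-scheme-performance-details",
    headers={"User-Agent": "Mozilla/5.0"},
).text
for iframe in BeautifulSoup(page, "lxml").find_all("iframe"):
    print(iframe.get("src"))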
Here is sample code with bs4 (requests + bs4 is faster than Selenium in this case):
import requests
from bs4 import BeautifulSoup
import pandas as pd

def fetchValue(primary_category, category, file):
    fund_name = []
    fund_benchmark = []
    riskometer_scheme = []
    riskometer_benchmark = []
    latest_nav_regular = []
    latest_nav_direct = []
    five_year_return_regular = []
    five_year_return_direct = []
    five_year_return_benchmark = []
    daily_aum_cr = []

    url = f'https://www.valueresearchonline.com/amfi/fund-performance-data/?end-type=1&primary-category={primary_category}&category={category}&amc=ALL&nav-date=25-Oct-2024'
    resp = requests.get(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0",
            "Referer": "https://www.valueresearchonline.com/amfi/fund-performance",
        },
    ).text
    soup = BeautifulSoup(resp, 'lxml')

    # Walk every table row; rows without the expected cells are skipped
    for row in soup.find_all('tr'):
        try:
            content = row.find_all('td')
            fund_name.append(content[0].text.strip())
            fund_benchmark.append(content[1].text)
            riskometer_scheme.append(content[2].text)
            riskometer_benchmark.append(content[3].text)
            latest_nav_regular.append(content[4].text.strip())
            latest_nav_direct.append(content[5].text.strip())
            five_year_return_regular.append(content[6].text.strip())
            five_year_return_direct.append(content[7].text.strip())
            five_year_return_benchmark.append(content[8].text.strip())
            daily_aum_cr.append(content[10].text.strip())
        except Exception:
            pass

    data = {
        "Scheme": fund_name,
        "Benchmark": fund_benchmark,
        "Riskometer_Scheme": riskometer_scheme,
        "Riskometer_Benchmark": riskometer_benchmark,
        "Latest_Nav_Regular": latest_nav_regular,
        "Latest_Nav_Direct": latest_nav_direct,
        "Five_Year_Return_Regular": five_year_return_regular,
        "Five_Year_Return_Direct": five_year_return_direct,
        "Five_Year_Return_Benchmark": five_year_return_benchmark,
        "Daily_AUM": daily_aum_cr,
    }
    df = pd.DataFrame(data)
    df.to_csv(file, index=False)

# Read the category dropdown from the fund-performance page to collect every category value
url = "https://www.valueresearchonline.com/amfi/fund-performance"
resp = requests.get(
    url,
    headers={
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0",
        "Referer": "https://www.amfiindia.com/",
    },
).text
soup = BeautifulSoup(resp, 'lxml')
category_list = soup.find('select', id='category')

# The first 40 options cover the category combinations for tabs 2-3
for option in category_list.find_all('option')[:40]:
    category = option['value']
    primary_category = category.split('_')[0]
    fetchValue(primary_category, category, f'{category}.csv')
I kept the code as basic as possible for easier understanding.
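One more note: the question asks for the current date, while the URL above hardcodes nav-date=25-Oct-2024. That parameter can be built with datetime instead (build_url below is only an illustrative helper, and whether the endpoint already has data published for today's date is something to verify):

from datetime import date

def build_url(primary_category, category):
    # Today's date in the DD-MMM-YYYY format used by the nav-date parameter, e.g. "07-Nov-2024"
    nav_date = date.today().strftime('%d-%b-%Y')
    return ('https://www.valueresearchonline.com/amfi/fund-performance-data/'
            f'?end-type=1&primary-category={primary_category}&category={category}'
            f'&amc=ALL&nav-date={nav_date}')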