python python-3.x web-scraping beautifulsoup pdf-scraping

How to parse a drop-down list and get all the PDF links using Beautiful Soup in Python?


I'm trying to scrape the PDF links from a drop-down on this website. I want to scrape just the Guideline Values (CVC) drop-down. Below is the code that I used, but it did not succeed:

import requests
from bs4 import BeautifulSoup

# Reuse a single session for every request made to the site.
req_ses = requests.Session()
# Bound the wait so an unresponsive server cannot hang the script indefinitely.
igr_get_base_response = req_ses.get("https://igr.karnataka.gov.in/english#", timeout=30)

# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed, emits a GuessedAtParserWarning, and may build a different tree
# on a different machine.
soup = BeautifulSoup(igr_get_base_response.text, "html.parser")

def matches_block(tag):
    """Return whether *tag* is a dropdown ``<li>`` holding a matching ``<a>``.

    A tag qualifies when ``matches_dropdown`` accepts it and ``tag.find``
    locates at least one element that satisfies ``matches_text``.
    """
    # PEP 8: compare against None by identity, not with the != operator.
    return matches_dropdown(tag) and tag.find(matches_text) is not None

def matches_dropdown(tag):
    """Return True when *tag* is an ``<li>`` whose class list has ``dropdown-toggle``.

    NOTE(review): in the HTML sample quoted with this question the
    ``dropdown-toggle`` class sits on the ``<a>`` element, not on the
    ``<li>`` -- confirm this filter targets the intended tag.
    """
    if tag.name != 'li':
        return False
    # Tags without a class attribute cannot be subscripted with 'class'.
    if not tag.has_attr('class'):
        return False
    return 'dropdown-toggle' in tag['class']

def matches_text(tag):
    """Return the text of *tag* when it is an ``<a>`` element, else ``False``.

    The result is meant to be used for its truthiness: an ``<a>`` whose
    text is empty yields an empty (falsy) string.
    """
    if tag.name != 'a':
        return False
    return tag.get_text()

# Walk every matching dropdown block and print the href of each link found
# inside its nested lists.
# NOTE(review): in the HTML sample quoted with this question the nested <ul>
# carries class "dropdown-menu", not "dropdown-toggle" -- confirm the filter.
for dropdown_item in soup.find_all(matches_block):
    for submenu in dropdown_item.find_all('ul', class_='dropdown-toggle'):
        for anchor in submenu.find_all('a'):
            # Anchors without an href (e.g. pure toggles) have nothing to print.
            if not anchor.has_attr('href'):
                continue
            print(anchor['href'])

Any suggestion would be a great help!

Edit: Adding part of HTML below:

<div class="collapse navbar-collapse">
    <ul class="nav navbar-nav">



        <li class="">
            <a href="https://igr.karnataka.gov.in/english" title="Home" class="shome"><i class="fa fa-home"> </i></a>
        </li>





        <li>
            <a class="dropdown-toggle" data-toggle="dropdown" title="RTI Act">RTI Act <b class="caret"></b></a>
            <ul class="dropdown-menu multi-level">

                <!-- <li> -->
                <li class="">
                    <a href=" https://igr.karnataka.gov.in/page/RTI+Act/Yadagiri+./en " title="Yadagiri .">Yadagiri .
                    </a>

                </li>

                <!-- </li> -->

                <!-- <li> 


Solution

  • So, I used the following approach to complete the above-mentioned part:

    def make_sqlite_dict_from_parsed_row(district_value, sro_value, pdf_file_link):
        """Build the row dict describing one scraped PDF link.

        The link is stripped of surrounding whitespace and its spaces are
        replaced with ``%20``; the row starts with status ``"PENDING"``.
        The ``hsh`` key is filled from ``get_hash`` (defined elsewhere --
        presumably a hash over the fields named in ``IGR_SQLITE_HSH_TUP``;
        confirm against its definition).
        """
        # hrefs on the page can carry stray leading/trailing spaces.
        cleaned_link = pdf_file_link.strip().replace(' ', '%20')
        row = {
            "district_value": district_value,
            "sro_value": sro_value,
            "pdf_file_link": cleaned_link,
            "status": "PENDING",
        }
        # The hash is computed before 'hsh' itself is added to the row.
        row['hsh'] = get_hash(row, IGR_SQLITE_HSH_TUP)
        return row
    
    # Every <li class="dropdown-submenu"> is treated as one district entry.
    li_element_list = home_response_soup.find_all('li', {'class': 'dropdown-submenu'})
    parsed_row_list = []

    for ele in li_element_list:
        # NOTE(review): assumes each submenu <li> contains a dropdown-toggle
        # anchor; .find() returns None otherwise -- confirm against the page.
        district_value = ele.find('a', {'class': 'dropdown-toggle'}).get_text().strip()
        # 'href': True keeps only anchors that actually carry a link; an anchor
        # without href would hand None to make_sqlite_dict_from_parsed_row,
        # whose .strip() call would then raise AttributeError.
        sro_pdf_a_tags = ele.find_all('a', attrs={'target': '_blank', 'href': True})

        if not sro_pdf_a_tags:
            print("District: ", district_value, "'s pdf is corrupted")
            continue

        # One row per PDF link listed under this district.
        parsed_row_list.extend(
            make_sqlite_dict_from_parsed_row(
                district_value,
                sro_a_tag.get_text(strip=True),
                sro_a_tag.get('href'),
            )
            for sro_a_tag in sro_pdf_a_tags
        )
    

    This will give a proper PDF link, SRO name and district name for each entry.