python web-scraping beautifulsoup bots headless

I want just the contact person and fax number as output.


When I run my code and answer the prompts ("Enter the start value for the range of company IDs: 18351", "Enter the end value for the range of company IDs: 78351"), the output is not what I want:

import requests
from bs4 import BeautifulSoup

# Pull the contact person's name and the fax number out of the parsed page
def get_contact_info(soup):
    """Return ``(contact_person, fax_number)`` found in the ``td.bla8`` cells.

    Every cell matching the ``td.bla8`` selector is inspected. For each label
    (``Contact:`` and ``Fax:``) present in a cell's text, the value is the
    text that follows the first occurrence of the label, cut at the next
    comma and stripped of surrounding whitespace. A later matching cell
    overwrites the value taken from an earlier one. A label that never
    appears yields an empty string.
    """
    found = {'Contact:': '', 'Fax:': ''}
    for cell in soup.select('td.bla8'):
        cell_text = cell.text
        for label in found:
            if label in cell_text:
                # Text right after the label, up to the first comma.
                after_label = cell_text.split(label)[1]
                found[label] = after_label.split(',')[0].strip()
    return found['Contact:'], found['Fax:']

# Download one supplier details page and extract the contact fields from it
def scrape_supplier_details(company_id):
    """Fetch the supplier details page for ``company_id`` and parse it.

    The page is requested with ``requests.get``, parsed with the
    ``html.parser`` backend of BeautifulSoup, and handed to
    ``get_contact_info``. Returns that function's
    ``(contact_person, fax_number)`` tuple.
    """
    page = requests.get(
        f'https://ha.internationaleprocurement.com/search/supplier_details.html?company_id={company_id}'
    )
    parsed = BeautifulSoup(page.content, 'html.parser')
    return get_contact_info(parsed)

# Ask for the inclusive bounds of the company-ID range (start first, then end)
start_id, end_id = (
    int(input(prompt))
    for prompt in (
        "Enter the start value for the range of company IDs: ",
        "Enter the end value for the range of company IDs: ",
    )
)

# Look up every company ID from start_id through end_id and print one line each
for company_id in range(start_id, end_id + 1):
    details = scrape_supplier_details(company_id)
    contact_person, fax_number = details
    print(f"Company ID: {company_id}, Contact: {contact_person}, Fax: {fax_number}")


Solution

  • In this example I use the re module to extract the contact person's name and fax number (if any):

    import re
    import requests
    from bs4 import BeautifulSoup
    
    # Pull the contact person's name and the fax number out of the parsed page
    def get_contact_info(soup):
        """Return ``(contact, fax)`` read from the cell that mentions "Contact:".

        The selector picks the first ``td`` that contains the text
        ``Contact:`` and has no ``td`` nested inside it. The cell's text is
        flattened to one stripped fragment per line, and each value is the
        rest of the line following ``Contact: `` or ``Fax: ``. A value that
        cannot be found, or a page with no such cell, yields an empty string.
        """
        cell = soup.select_one('td:not(:has(td)):-soup-contains("Contact:")')
        if not cell:
            return '', ''

        lines = cell.get_text(strip=True, separator='\n')

        values = []
        for pattern in (r'Contact: ([^\n]+)', r'Fax: ([^\n]+)'):
            match = re.search(pattern, lines)
            values.append(match.group(1) if match else '')

        return values[0], values[1]
    
    # Define a function to scrape the supplier details page for a given company ID
    def scrape_supplier_details(company_id, timeout=30):
        """Fetch one supplier details page and extract its contact fields.

        Args:
            company_id: Value substituted into the ``company_id`` query
                parameter of the supplier details URL.
            timeout: Seconds to wait for the server before giving up. Passed
                straight to ``requests.get``; defaults to 30.

        Returns:
            The ``(contact_person, fax_number)`` tuple produced by
            ``get_contact_info`` for the downloaded page.

        Raises:
            requests.exceptions.Timeout: If the server does not respond
                within ``timeout`` seconds.
        """
        url = f'https://ha.internationaleprocurement.com/search/supplier_details.html?company_id={company_id}'
        # requests waits forever by default; when scraping tens of thousands
        # of pages a single stalled connection would otherwise hang the run.
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        return get_contact_info(soup)
    
    # Inclusive bounds of the company-ID range to scrape
    start_id, end_id = 18351, 78351

    # Look up every company ID from start_id through end_id and print one line each
    for company_id in range(start_id, end_id + 1):
        details = scrape_supplier_details(company_id)
        contact_person, fax_number = details
        print(f"Company ID: {company_id}, Contact: {contact_person}, Fax: {fax_number}")
    

    Prints:

    Company ID: 18351, Contact: , Fax: 
    Company ID: 18352, Contact: , Fax: 
    Company ID: 18353, Contact: Jim Devlin, Fax: 609-252-8015
    Company ID: 18354, Contact: Scott Grumski, Fax: 724-224-6050
    Company ID: 18355, Contact: , Fax: 
    Company ID: 18356, Contact: Anthony Corum, Fax: 302-337-0998
    Company ID: 18357, Contact: Bill Kenyon, Fax: 814-432-5678
    Company ID: 18358, Contact: Sal Austin, Fax: 201-433-4334
    Company ID: 18359, Contact: Raymond Petrarca, Fax: 401-921-5520
    ...and so on.