python, html, web-scraping, beautifulsoup, python-requests

How to web scrape the Thomasnet website to get supplier information in Python


I want to extract supplier information such as supplier name, location, annual revenue, year founded, number of employees, product description, etc. from https://www.thomasnet.com/ for a particular location and category. For example, I want to extract the information for all 201 suppliers in the category "Battery" and the location "Southern California".

Right now I copy the URL of each results page for the category "Battery" and the location "Southern California" and extract the supplier information from it. Is there a way to automate the process so that I get all the suppliers' information just by providing the category and location (irrespective of the number of pages for that search)?

This is what I am doing right now:

import requests
from bs4 import BeautifulSoup

url = 'https://www.thomasnet.com/southern-california/batteries-3510203-1.html'
html_content = requests.get(url).text

# Parse the HTML content
soup = BeautifulSoup(html_content, "lxml")

# Supplier names
supp_lst = soup.find_all(class_="profile-card__title")
for data in supp_lst:
    # Get text from each tag
    print(data.text)

# Supplier locations
supp_location_lst = soup.find_all(class_="profile-card__location")
for data in supp_location_lst:
    # Get text from each tag
    print(data.text)

# Product descriptions
supp_content_lst = soup.find_all(class_="profile-card__body profile-card__mobile-view read-more-wrap")
for data in supp_content_lst:
    # Get text from each tag
    print(data.text)

# Supplier data
supp_data_lst = soup.find_all(class_="profile-card__supplier-data")
for data in supp_data_lst:
    # Get text from each tag
    print(data.text)

I am very new to web scraping. Any help and suggestions will be highly appreciated. TIA.


Solution

  • Just use the page number in the URL and the last-page marker ("Sorry, we found no matches ...") to iterate over all pages.

    This example script collects supplier names (a sketch extending it to full records follows after the script):

    import requests
    from bs4 import BeautifulSoup
    
    
    def find_supplier_names() -> list[str]:
        all_supplier_names = []
        i = 1
        while True:
            r = requests.get(
                f'https://www.thomasnet.com/southern-california/batteries-3510203-{i}.html',
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
                },
            )
            if not r.ok or 'Sorry, we found no matches' in r.text:
                break  # no data -> last page was reached in previous iteration
    
            soup = BeautifulSoup(r.text, "lxml")
            supplier_names_raw = soup.find_all(class_='custom-control-input selectco')
            found_supplier_names_on_page = [x['data-coname'] for x in supplier_names_raw]
            print(f'Found {len(found_supplier_names_on_page)} suppliers on page {i}')
            all_supplier_names.extend(found_supplier_names_on_page)
            i += 1
    
        return all_supplier_names
    
    
    if __name__ == '__main__':
        print(find_supplier_names())
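
    The question also asks for location, product description, and company data, not just names. Below is a minimal sketch extending the same pagination loop to full records. It reuses the field classes from the question's own snippet (profile-card__title, profile-card__location, profile-card__body, profile-card__supplier-data) and assumes each result sits in a parent container with class profile-card; that container name is a guess, so adjust it to whatever the live markup actually uses.

    import time

    import requests
    from bs4 import BeautifulSoup


    def find_suppliers(url_prefix: str) -> list[dict]:
        """Collect one record per supplier card across all result pages."""
        all_suppliers = []
        i = 1
        while True:
            r = requests.get(
                f'{url_prefix}-{i}.html',
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
                },
            )
            if not r.ok or 'Sorry, we found no matches' in r.text:
                break  # no data -> last page was reached in the previous iteration

            soup = BeautifulSoup(r.text, "lxml")
            # ASSUMPTION: each result is wrapped in a 'profile-card' container;
            # the child classes below are the ones used in the question.
            for card in soup.find_all(class_='profile-card'):
                name = card.find(class_='profile-card__title')
                location = card.find(class_='profile-card__location')
                body = card.find(class_='profile-card__body')
                data = card.find(class_='profile-card__supplier-data')
                all_suppliers.append({
                    'name': name.get_text(strip=True) if name else None,
                    'location': location.get_text(strip=True) if location else None,
                    'description': body.get_text(strip=True) if body else None,
                    'supplier_data': data.get_text(' ', strip=True) if data else None,
                })
            time.sleep(1)  # small delay between page requests to stay polite
            i += 1

        return all_suppliers


    if __name__ == '__main__':
        suppliers = find_suppliers('https://www.thomasnet.com/southern-california/batteries-3510203')
        print(f'Collected {len(suppliers)} suppliers in total')

    From there, writing the list of dicts out as CSV or JSON is a one-liner with the standard csv or json modules.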