I want to extract supplier information like supplier name, location, annual revenue, year founded, number of employees, product description etc from https://www.thomasnet.com/ for a particular location and category. For example, I want to extract all 201 suppliers information for category "Battery" and location "Southern California".
I am copying the url of each page for category "Battery" and location "Southern California" and getting the supplier information. But is there any way to automate the process such that I will get all the suppliers information if I put the category and location (irrespective of the number of pages for that search)?
This is what I am doing right now.
# Scrape one Thomasnet results page and print the supplier fields found on it.
import requests
import ssl
from bs4 import BeautifulSoup, SoupStrainer

# Hard-coded: first results page for category "Battery" in "Southern California".
url = 'https://www.thomasnet.com/southern-california/batteries-3510203-1.html'
html_content = requests.get(url).text

# Parse the html content
soup = BeautifulSoup(html_content, "lxml")

# Supplier names.
supp_lst = soup.find_all(class_="profile-card__title")
for data in supp_lst:
    # Get text from each tag
    print(data.text)

# Supplier locations.
supp_location_lst = soup.find_all(class_="profile-card__location")
for data in supp_location_lst:
    # Get text from each tag
    print(data.text)

# Supplier/product descriptions.
supp_content_lst = soup.find_all(class_="profile-card__body profile-card__mobile-view read-more-wrap")
for data in supp_content_lst:
    # Get text from each tag
    print(data.text)

# Supplier data (annual revenue, year founded, number of employees, ...).
supp_lst = soup.find_all(class_="profile-card__supplier-data")
for data in supp_lst:
    # Get text from each tag
    print(data.text)
I am very new to web scraping. Any help and suggestions will be highly appreciated. TIA.
Just use the page number in the URL and the last-page marker ("Sorry, we found no matches...") to iterate over all pages.
This example script collects supplier names:
import requests
from bs4 import BeautifulSoup
def find_supplier_names() -> list[str]:
    """Collect supplier names from every results page of the search.

    Requests page 1, 2, ... of the hard-coded Southern California
    "batteries" search until a request fails or Thomasnet reports no
    matches, and returns all supplier names gathered along the way.

    Returns:
        The supplier names from all result pages, in page order.
    """
    all_supplier_names: list[str] = []
    page = 1
    while True:
        response = requests.get(
            f'https://www.thomasnet.com/southern-california/batteries-3510203-{page}.html',
            headers={
                # A browser-like User-Agent so the site serves the normal page.
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
            },
            # Without a timeout, requests can block forever on a stalled
            # connection, hanging the whole pagination loop.
            timeout=30,
        )
        # Stop on an HTTP error or on the site's "no matches" page: the
        # previous iteration already processed the last real results page.
        if not response.ok or 'Sorry, we found no matches' in response.text:
            break
        soup = BeautifulSoup(response.text, "lxml")
        # Each supplier row's checkbox carries the company name in data-coname.
        checkboxes = soup.find_all(class_='custom-control-input selectco')
        names_on_page = [tag['data-coname'] for tag in checkboxes]
        print(f'Found {len(names_on_page)} suppliers on page {page}')
        all_supplier_names.extend(names_on_page)
        page += 1
    return all_supplier_names
if __name__ == '__main__':
    # Run the scraper and print the full list of collected supplier names.
    names = find_supplier_names()
    print(names)