I have the following soup:
<a href="some_url">next</a>
<span class="class">...</span>
From this I want to extract the href, "some_url",
and the whole list of pages that are linked from this page: https://www.catholic-hierarchy.org/diocese/laa.html
Note: there are a lot of links to sub-pages which I need to parse as well. At the moment I am trying to get all the data out of them: dioceses, URLs, descriptions, contact data, etc.
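For the single anchor tag, a minimal sketch of the href extraction with BeautifulSoup (the variable names here are illustrative):
from bs4 import BeautifulSoup

html = '<a href="some_url">next</a><span class="class">...</span>'
soup = BeautifulSoup(html, "html.parser")
# a tag's attributes behave like a dict, so the href is just a key lookup
href = soup.select_one("a")["href"]
print(href)  # some_url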
The example below grabs all diocese URLs, gets some info about each of them and creates a final dataframe. To speed up the process, multiprocessing.Pool is used:
But wait: how do I get this scraper running without multiprocessing!? I want to run it in Colab, so I need to get rid of the multiprocessing feature.
How can I achieve this?
import pandas as pd
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool
def get_dioceses_urls(section_url):
    dioceses_urls = set()
    while True:
        print(section_url)
        soup = BeautifulSoup(
            requests.get(section_url, headers=headers).content, "lxml"
        )
        for a in soup.select('ul a[href^="d"]'):
            dioceses_urls.add(
                "https://www.catholic-hierarchy.org/diocese/" + a["href"]
            )
        # is there a Next Page button?
        next_page = soup.select_one('a:has(img[alt="[Next Page]"])')
        if next_page:
            section_url = (
                "https://www.catholic-hierarchy.org/diocese/"
                + next_page["href"]
            )
        else:
            break
    return dioceses_urls
def get_diocese_info(url):
    print(url)
    soup = BeautifulSoup(requests.get(url, headers=headers).content, "html5lib")
    data = {
        "Title 1": soup.h1.get_text(strip=True),
        "Title 2": soup.h2.get_text(strip=True),
        "Title 3": soup.h3.get_text(strip=True) if soup.h3 else "-",
        "URL": url,
    }
    # find the leaf <li> that contains "Type of Jurisdiction:"
    li = soup.find(
        lambda tag: tag.name == "li"
        and "type of jurisdiction:" in tag.text.lower()
        and tag.find() is None
    )
    if li:
        # parse every "Key: Value" item from the surrounding <ul>
        for l in li.find_previous("ul").find_all("li"):
            t = l.get_text(strip=True, separator=" ")
            if ":" in t:
                k, v = t.split(":", maxsplit=1)
                data[k.strip()] = v.strip()
    # get other info about the diocese
    # ...
    return data
if __name__ == "__main__":
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0"
    }
    # get main sections:
    url = "https://www.catholic-hierarchy.org/diocese/laa.html"
    soup = BeautifulSoup(
        requests.get(url, headers=headers).content, "html.parser"
    )
    main_sections = [url]
    for a in soup.select("a[target='_parent']"):
        main_sections.append(
            "https://www.catholic-hierarchy.org/diocese/" + a["href"]
        )
    all_data, dioceses_urls = [], set()
    with Pool() as pool:
        # get all dioceses urls:
        for urls in pool.imap_unordered(get_dioceses_urls, main_sections):
            dioceses_urls.update(urls)
        # get info about all dioceses:
        for info in pool.imap_unordered(get_diocese_info, dioceses_urls):
            all_data.append(info)
    # create dataframe from the info about dioceses
    df = pd.DataFrame(all_data).sort_values("Title 1")
    # save it to csv file
    df.to_csv("data.csv", index=False)
    print(df.head().to_markdown())
Update: here is what I get back if I run the script on Colab:
https://www.catholic-hierarchy.org/diocese/laa.html
https://www.catholic-hierarchy.org/diocese/lab.html
---------------------------------------------------------------------------
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "/usr/lib/python3.7/multiprocessing/pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "<ipython-input-1-f5ea34a0190f>", line 21, in get_dioceses_urls
next_page = soup.select_one('a:has(img[alt="[Next Page]"])')
File "/usr/local/lib/python3.7/dist-packages/bs4/element.py", line 1403, in select_one
value = self.select(selector, limit=1)
File "/usr/local/lib/python3.7/dist-packages/bs4/element.py", line 1528, in select
'Only the following pseudo-classes are implemented: nth-of-type.')
NotImplementedError: Only the following pseudo-classes are implemented: nth-of-type.
"""
The above exception was the direct cause of the following exception:
NotImplementedError Traceback (most recent call last)
<ipython-input-1-f5ea34a0190f> in <module>
81 with Pool() as pool:
82 # get all dioceses urls:
---> 83 for urls in pool.imap_unordered(get_dioceses_urls, main_sections):
84 dioceses_urls.update(urls)
85
/usr/lib/python3.7/multiprocessing/pool.py in next(self, timeout)
746 if success:
747 return value
--> 748 raise value
749
750 __next__ = next # XXX
NotImplementedError: Only the following pseudo-classes are implemented: nth-of-type.
The problem with running the script on Google Colab is that it currently only supports Python 3.7 and ships an older BeautifulSoup whose CSS engine doesn't implement the :has pseudo-class (see the NotImplementedError above), so your a:has selector is not supported. I have replaced it with a loop over all a tags, which is slightly slower, but the code works on Google Colab, and there is no need to remove multiprocessing. If you do need to remove multiprocessing, you can convert your functions into coroutines and run them as tasks using asyncio, as suggested by @Barry the Platipus; a simpler sequential alternative is sketched after the code below.
def get_dioceses_urls(section_url):
    dioceses_urls = set()
    while True:
        print(section_url)
        soup = BeautifulSoup(
            requests.get(section_url, headers=headers).content, "lxml"
        )
        for a in soup.select('ul a[href^="d"]'):
            dioceses_urls.add(
                "https://www.catholic-hierarchy.org/diocese/" + a["href"]
            )
        # is there a Next Page button?
        next_page = None
        for a in soup.find_all("a"):
            # .get() avoids a KeyError for <img> tags without an alt attribute
            if a.img and a.img.get("alt") == "[Next Page]":
                next_page = a
                break
        if next_page:
            section_url = (
                "https://www.catholic-hierarchy.org/diocese/"
                + next_page["href"]
            )
        else:
            break
    return dioceses_urls
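If you do want to drop multiprocessing entirely for Colab, coroutines are not strictly required: the with Pool() block in __main__ can simply be replaced by plain loops that call the two functions directly. A minimal sketch, assuming headers, main_sections, get_dioceses_urls and get_diocese_info are defined as above:
    # sequential replacement for the two pool.imap_unordered() stages
    all_data, dioceses_urls = [], set()
    for section_url in main_sections:
        # stage 1: collect the URLs of all dioceses, section by section
        dioceses_urls.update(get_dioceses_urls(section_url))
    for diocese_url in dioceses_urls:
        # stage 2: scrape the info for each diocese, one request at a time
        all_data.append(get_diocese_info(diocese_url))
    df = pd.DataFrame(all_data).sort_values("Title 1")
    df.to_csv("data.csv", index=False)
This runs the requests strictly one after another, so it is noticeably slower than the pooled version; the asyncio route mentioned above is how you would get the concurrency back without multiprocessing.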