python, python-3.x, web-scraping, beautifulsoup, python-requests

How to extract particular tags from soup using python?


From the webpages below, I would like to extract the following data:

https://www.ams.usda.gov/services/enforcement/organic/settlements https://www.ams.usda.gov/services/enforcement/organic/settlements-2023

"03/19/2025" as "date", 
"United Fruit and Produce Co. – St. Louis, Missouri" as "title" and
"United Fruit and Produce Co. (United) withdraws its appeal, waives further appeal rights in this matter and agrees to stop selling product as organic until its organic certification is reinstated by NOP and to pay its total civil penalty within 30 days. If/when reinstated, United agrees to provide on-time responses to all certifier requests for information and detailed documentation required to maintain organic certification; to inform its certifier of operational or product changes with an updated organic system plan; and to maintain organic certificates of all products it handles."  as "text"

I am going through all the <p> tags. The issue is that the <strong> tag holds the date and title, but in some cases it sits inside the same <p> tag as the text that follows it. How can I extract them properly?

I tried the code below:

import requests
from bs4 import BeautifulSoup
link_url = 'https://www.ams.usda.gov/services/enforcement/organic/settlements'

# Fetch the page; raise_for_status() raises on an HTTP error status.
with requests.get(link_url) as response:
    response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Narrow the document down to the body field inside <main id="main-content">
# and collect every <p> tag below it.
main_div = soup.find('main', attrs={'id': 'main-content'})
div_class = main_div.find(
    'div',
    attrs={'class': 'field field--name-body field--type-text-with-summary field--label-hidden field__item'},
)
paragraphs = div_class.find_all('p')
print(paragraphs)

# Two independent result lists: the <strong> captions, and everything else.
strong_tags = []
text_tags = []

for paragraph in paragraphs:
    whole_text = paragraph.get_text(strip=True)
    heading = paragraph.find('strong')

    if not heading:
        # Paragraph without a <strong> tag: keep its full text if non-empty.
        if whole_text:
            text_tags.append(whole_text)
        continue

    # Paragraph with a <strong> tag: record the caption ...
    caption = heading.get_text(strip=True)
    strong_tags.append(caption)

    # ... and whatever is left of the paragraph once the caption is removed.
    leftover = whole_text.replace(caption, "").strip()
    if leftover:
        text_tags.append(leftover)

# Print both lists, each under its own heading.
for banner, entries in (
    ("Strong Paragraphs:", strong_tags),
    ("\nText Paragraphs:", text_tags),
):
    print(banner)
    for entry in entries:
        print(entry)

Solution

  • The HTML layout is irregular, so it can only be parsed by first inspecting it (using some external tool) and then applying logic that fits what has been determined empirically.

    As a starting point I would recommend designing a class that encapsulates each "article".

    Something like this:

    import requests
    from bs4 import BeautifulSoup as BS
    import re
    
    # Choose the BeautifulSoup parser name: "lxml" when that package can be
    # imported, otherwise the built-in "html.parser".
    try:
        import lxml  # noqa: F401 -- imported only to test availability
    except ModuleNotFoundError:
        PARSER = "html.parser"
    else:
        PARSER = "lxml"
    
    # Pages to scrape; the main loop requests them in this order.
    URLS = [
        "https://www.ams.usda.gov/services/enforcement/organic/settlements",
        "https://www.ams.usda.gov/services/enforcement/organic/settlements-2023",
        "https://www.ams.usda.gov/services/enforcement/organic/settlements-2024"
    ]
    # Matches text that begins with a date (1-2 digits / 1-2 digits / 2-4 digits)
    # immediately followed by ":" or "-".  Group 1 is the date; group 2 is the
    # remainder of the text after any whitespace that follows the separator.
    PATTERN = re.compile(r"^(\d{1,2}/\d{1,2}/\d{2,4})[:-]\s*(.*)$")
    
    class Article:
        """One scraped "article": a date, a title and its body text.

        All three values are stored with surrounding whitespace removed.
        """

        def __init__(self, date: str, title: str, text: str) -> None:
            self.date = date.strip()
            self.title = title.strip()
            self.text = text.strip()

        def append(self, s: str) -> None:
            """Add the continuation fragment *s* to the end of ``text``.

            An empty *s* is ignored.  When there is existing text, a single
            space is inserted first unless *s* already begins with whitespace.
            """
            if not s:
                return
            if not self.text:
                # Nothing to join onto yet: a separator here would leave the
                # text with leading whitespace, which __init__ never allows.
                self.text = s.lstrip()
                return
            if not s[0].isspace():
                self.text += " "
            self.text += s

        def __str__(self) -> str:
            return f"Date={self.date}, Title={self.title}, Text={self.text}"

        # The same single-line rendering is used for debugging output.
        __repr__ = __str__
    
    # Every Article found, across all pages, in document order.
    articles = []

    with requests.Session() as session:
        for url in URLS:
            with session.get(url) as response:
                response.raise_for_status()
                soup = BS(response.text, PARSER)
                paragraphs = soup.select("#block-mainpagecontent > article > div > div > p")
                # Heading seen but not yet turned into an Article (reset per page).
                date = title = ""
                # The first two selected paragraphs are skipped.
                for paragraph in paragraphs[2:]:
                    heading = paragraph.select_one("strong")
                    if heading:
                        found = PATTERN.match(heading.text)
                        if found:
                            date, title = found.groups()
                        # Detach the <strong> tag so that paragraph.text below
                        # yields only the remaining text.
                        heading.extract()
                    body = paragraph.text.strip()
                    if not body:
                        # Heading-only or empty paragraph: wait for later text.
                        continue
                    if date:
                        if title:
                            # A pending heading plus text starts a new Article.
                            articles.append(Article(date, title, body))
                            date = title = ""
                    elif articles:
                        # No pending heading: the text continues the last Article.
                        articles[-1].append(body)

    for entry in articles:
        print(entry)