python, web-scraping, directory, format, clean-urls

Saving / Editing Scraped URLs to Directory


I have successfully scraped links from a website and I want to save them to a local folder, already created, called "HerHoops" for parsing later. In the past I have successfully done this, but this website's links need a little more cleaning up.

So far this is my code. I want to keep everything after "box_score" in the link so that the saved filename includes the date and the teams playing. The files are saved in write mode ("w+").

# Schedule page for a single WNBA date.
# NOTE(review): the f-string has no placeholders, so this is effectively a
# plain string literal.
url = f"https://herhoopstats.com/stats/wnba/schedule_date/2004/6/1/"
data = requests.get(url)
# NOTE(review): no parser is named, so BeautifulSoup guesses one (newer
# versions emit a GuessedAtParserWarning here).
soup = BeautifulSoup(data.text)
# First (and only expected) schedule container on the page.
matchup_table = soup.find_all("div", {"class": "schedule"})[0]

# Collect every href in the schedule, then keep only the box-score links.
links = matchup_table.find_all('a')
links = [l.get("href") for l in links]
links = [l for l in links if '/box_score/' in l]

# Relative hrefs -> absolute URLs.
box_scores_urls = [f"https://herhoopstats.com{l}" for l in links]

for box_scores_url in box_scores_urls:
      data = requests.get(box_scores_url)
      # within loop opening up page and saving to folder in write mode
      # BUG (per the traceback below): box_scores_url[46:] still contains
      # "/" characters ("2004/06/01/<teams>/"), so open() treats them as
      # subdirectories that do not exist under HerHoops/ and raises
      # FileNotFoundError.
      with open("HerHoops/{}".format(box_scores_url[46:]), "w+") as f:
         # write to the files
         f.write(data.text) 
      time.sleep(3)

The error is

FileNotFoundError: [Errno 2] No such file or directory: 'HerHoops/2004/06/01/new-york-liberty-vs-charlotte-sting/'

Solution

  • From the error itself it is clear that you are trying to write to the file 'HerHoops/2004/06/01/new-york-liberty-vs-charlotte-sting/', but part of that directory path does not exist. We can create the necessary directories with the os.makedirs() function before writing to the file.

    Full code

    import os
    import time
    import requests
    from bs4 import BeautifulSoup
    import re
    from datetime import datetime

    # Schedule page for a single date (plain string; no placeholders needed).
    url = "https://herhoopstats.com/stats/wnba/schedule_date/2004/6/1/"
    data = requests.get(url)
    # Name the parser explicitly so bs4 does not have to guess (and warn).
    soup = BeautifulSoup(data.text, "html.parser")
    matchup_table = soup.find_all("div", {"class": "schedule"})[0]

    # Collect every href in the schedule, keep only the box-score links,
    # then turn the relative hrefs into absolute URLs.
    links = matchup_table.find_all("a")
    links = [l.get("href") for l in links]
    links = [l for l in links if "/box_score/" in l]

    box_scores_urls = [f"https://herhoopstats.com{l}" for l in links]

    # Create the target folder once, up front (no error if it already exists).
    os.makedirs("HerHoops", exist_ok=True)

    for box_scores_url in box_scores_urls:
        data = requests.get(box_scores_url)
        # Everything after "box_score/" is "YYYY/MM/DD/team-a-vs-team-b/".
        # Turning its slashes into dashes keeps the date and the teams in
        # the name while producing a single flat filename, so no nested
        # directories are needed (this was the cause of FileNotFoundError).
        tail = box_scores_url.split("/box_score/", 1)[1].strip("/")
        filename = tail.replace("/", "-") + ".html"
        # "w" suffices (nothing is read back); utf-8 matches the page text.
        with open(os.path.join("HerHoops", filename), "w", encoding="utf-8") as f:
            # write the page to the file
            f.write(data.text)
        # Be polite to the server between requests.
        time.sleep(3)