Tags: syntax-error, spacing, cellspacing, line-spacing

Is there a way to remove excess spacing in Python code?


My code below gets the street address for each gym, but there is an error in the spacing of the output for the hours that the gym is open. Any ideas of where I went wrong?

import urlparse

from bs4 import BeautifulSoup
from bs4 import Tag
import requests
import time
import csv

# Scrape the Planet Fitness sitemap, then visit every gym page and record
# its address and club hours as one CSV row per gym.
sitemap = 'https://www.planetfitness.com/sitemap'
index_soup = BeautifulSoup(requests.get(sitemap).content, 'html.parser')

# Every gym is linked from a "club-title" table cell.
gym_paths = [
    anchor.get('href')
    for anchor in index_soup.select('td[class~=club-title] > a[href^="/gyms"]')
]

# Selectors for the address parts, in the order they appear in the row.
address_selectors = (
    'p[class~=address] > span[class~=address-line1]',
    'p[class~=address] > span[class~=locality]',
    'p[class~=address] > span[class~=administrative-area]',
    'p[class~=address] > span[class~=postal-code]',
    'p[class~=address] > span[class~=country]',
)

with open('gyms.csv', 'w') as gf:
    gymwriter = csv.writer(gf)
    for gym_path in gym_paths:
        gymurl = urlparse.urljoin(sitemap, gym_path)
        gym_soup = BeautifulSoup(requests.get(gymurl).content, 'html.parser')
        gymrow = [gymurl]

        # First match of each address selector becomes one column.
        for selector in address_selectors:
            gymrow.append(gym_soup.select(selector)[0].text)

        # The hours live in the first tag that follows a "Club Hours" heading.
        for heading in gym_soup.select('div > strong'):
            if heading.text != 'Club Hours':
                continue
            hours_tag = next(
                (node for node in heading.next_siblings if isinstance(node, Tag)),
                None,
            )
            if hours_tag is not None:
                gymrow.append(hours_tag.text)

        print(gymrow)
        gymwriter.writerow(gymrow)
        # Pause between page requests.
        time.sleep(3)

Thank you for your help!


Solution

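  • You want to select the `td` elements (of class `club-title`) that contain `a` elements, and extract the `href` attribute from each link. The hours text for a gym spans several lines, so replace each newline in it with ', ' before adding it to the CSV row.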

    from bs4 import BeautifulSoup
    from bs4 import Tag
    import requests
    import urllib.parse
    import time
    import csv
    
    # Scrape the Planet Fitness sitemap, then visit every gym page and write
    # its URL, address parts and club hours to gyms.csv (one row per gym).
    sitemap = 'https://www.planetfitness.com/sitemap'
    res = requests.get(sitemap).content
    soup = BeautifulSoup(res, 'html.parser')

    # The rows in the table of gyms are formatted like so:
    # <tr>
    # <td class="club-title"><a href="/gyms/albertville-al"><strong>Albertville, AL</strong> <p>5850 US Hwy 431</p></a></td>
    # <td class="club-join"><div class="button"><a href="/gyms/albertville-al/offers" title="Join Albertville, AL">Join Now</a></div></td>
    # </tr>

    # This will find all the links to all the gyms.
    atags = soup.select('td[class~=club-title] > a[href^="/gyms"]')
    links = [atag.get('href') for atag in atags]

    # Classes of the address <span>s, in the column order used in the CSV.
    address_classes = (
        'address-line1',
        'locality',
        'administrative-area',
        'postal-code',
        'country',
    )

    # newline='' is required by the csv module: the writer emits its own
    # line terminators, and newline translation would otherwise add an
    # extra '\r' (blank lines between rows) on some platforms.
    with open('gyms.csv', 'w', newline='') as gf:
        gymwriter = csv.writer(gf)
        for link in links:
            # Follow the link to this gym
            gymurl = urllib.parse.urljoin(sitemap, link)
            res = requests.get(gymurl).content
            soup = BeautifulSoup(res, 'html.parser')
            gymrow = [gymurl]

            # The address of this gym. A missing part becomes an empty cell
            # so one incomplete page neither aborts the crawl nor shifts
            # the remaining columns.
            for address_class in address_classes:
                matches = soup.select(
                    'p[class~=address] > span[class~=' + address_class + ']'
                )
                gymrow.append(matches[0].text if matches else '')

            # The hours of this gym: the first tag following the
            # "Club Hours" heading.
            strongs = soup.select('div > strong')
            for strong in strongs:
                if strong.text == 'Club Hours':
                    for sibling in strong.next_siblings:
                        if isinstance(sibling, Tag):
                            hours = sibling.text
                            # Join the lines of the hours text with ', ' so
                            # the value stays on a single line.
                            gymrow.append(hours.replace('<br>', '').replace('\n', ', '))
                            break

            gymwriter.writerow(gymrow)
            # Pause between page requests.
            time.sleep(3)
    

    When I run this, I get:

    $ more gyms.csv
    
    https://www.planetfitness.com/gyms/albertville-al,5850 US Hwy 431,Albertville,AL,35950,United States,"Monday-Friday 6am-9pm, Saturday-Sunday 7am-7pm"
    https://www.planetfitness.com/gyms/alexander-city-al,987 Market Place,Alexander City,AL,35010,United States,Convenient hours when we reopen
    https://www.planetfitness.com/gyms/bessemer-al,528 W Town Plaza,Bessemer,AL,35020,United States,Convenient hours when we reopen
    https://www.planetfitness.com/gyms/birmingham-crestline-al,4500 Montevallo Rd,Birmingham,AL,35210,United States,Convenient hours when we reopen
    .
    .
    .