My code below gets the street address for each gym, but there is an error in the spacing of the output for the hours that the gym is open. Any ideas of where I went wrong?
import urlparse
from bs4 import BeautifulSoup
from bs4 import Tag
import requests
import time
import csv
# Download the sitemap and collect the relative link of every gym page.
sitemap = 'https://www.planetfitness.com/sitemap'
index_page = BeautifulSoup(requests.get(sitemap).content, 'html.parser')
links = [
    anchor.get('href')
    for anchor in index_page.select('td[class~=club-title] > a[href^="/gyms"]')
]

# Selectors for the address parts, in the order they are written to each row.
address_selectors = (
    'p[class~=address] > span[class~=address-line1]',
    'p[class~=address] > span[class~=locality]',
    'p[class~=address] > span[class~=administrative-area]',
    'p[class~=address] > span[class~=postal-code]',
    'p[class~=address] > span[class~=country]',
)

with open('gyms.csv', 'w') as gf:
    gymwriter = csv.writer(gf)
    for link in links:
        # Resolve the relative link against the sitemap URL and fetch the page.
        gymurl = urlparse.urljoin(sitemap, link)
        page = BeautifulSoup(requests.get(gymurl).content, 'html.parser')

        # Row layout: URL first, then one cell per address part.
        gymrow = [gymurl]
        for selector in address_selectors:
            gymrow.append(page.select(selector)[0].text)

        # The hours are the text of the first tag that follows a
        # "Club Hours" heading; that text is appended unmodified.
        for heading in page.select('div > strong'):
            if heading.text != 'Club Hours':
                continue
            first_tag = next(
                (node for node in heading.next_siblings
                 if isinstance(node, Tag)),
                None
            )
            if first_tag is not None:
                gymrow.append(first_tag.text)

        print(gymrow)
        gymwriter.writerow(gymrow)
        # Pause between page requests.
        time.sleep(3)
Thank you for your help!
You want to select the `td` elements (of class `club-title`) that contain `a` elements, and extract the `href` attribute.
from bs4 import BeautifulSoup
from bs4 import Tag
import requests
import urllib.parse
import time
import csv
sitemap = 'https://www.planetfitness.com/sitemap'

# CSS classes of the address <span>s, in the order they are written to the CSV.
ADDRESS_FIELDS = (
    'address-line1',
    'locality',
    'administrative-area',
    'postal-code',
    'country',
)
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can block forever
CRAWL_DELAY = 3  # seconds to pause between gym pages


def format_hours(text):
    """Collapse multi-line opening hours into one comma-separated line.

    Each line of *text* is stripped of surrounding whitespace and blank
    lines are dropped, so the result never starts or ends with a stray
    separator. There is no need to remove ``<br>``: the text extracted from
    a tag contains no markup, only the line breaks around it.
    """
    stripped = (line.strip() for line in text.splitlines())
    return ', '.join(line for line in stripped if line)


def fetch_soup(url):
    """Download *url* and return it parsed with ``html.parser``.

    Raises ``requests.RequestException`` on connection problems, timeouts
    and non-2xx responses.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def find_hours(soup):
    """Return the formatted club hours found in *soup*, or '' if there are none.

    The hours are taken from the first tag that follows a
    ``<strong>Club Hours</strong>`` heading.
    """
    for strong in soup.select('div > strong'):
        if strong.text != 'Club Hours':
            continue
        for sibling in strong.next_siblings:
            # Skip the bare strings (whitespace) between the heading and
            # the tag that actually holds the hours.
            if isinstance(sibling, Tag):
                return format_hours(sibling.text)
    return ''


def gym_row(gymurl, soup):
    """Build the CSV row for one gym page: URL, address fields, hours.

    A missing address part or missing hours becomes an empty cell, so every
    row has the same number of columns.
    """
    row = [gymurl]
    for css_class in ADDRESS_FIELDS:
        spans = soup.select(
            'p[class~=address] > span[class~={}]'.format(css_class))
        row.append(spans[0].text if spans else '')
    row.append(find_hours(soup))
    return row


def main():
    """Scrape every gym linked from the sitemap into gyms.csv."""
    # The rows in the table of gyms are formatted like so:
    # <tr>
    #   <td class="club-title"><a href="/gyms/albertville-al"><strong>Albertville, AL</strong> <p>5850 US Hwy 431</p></a></td>
    #   <td class="club-join"><div class="button"><a href="/gyms/albertville-al/offers" title="Join Albertville, AL">Join Now</a></div></td>
    # </tr>
    # This will find all the links to all the gyms.
    atags = fetch_soup(sitemap).select(
        'td[class~=club-title] > a[href^="/gyms"]')
    links = [atag.get('href') for atag in atags]

    # newline='' lets the csv module control line endings itself; without it
    # an extra blank line appears after every row on Windows.
    with open('gyms.csv', 'w', newline='', encoding='utf-8') as gf:
        gymwriter = csv.writer(gf)
        for link in links:
            # Follow the link to this gym.
            gymurl = urllib.parse.urljoin(sitemap, link)
            try:
                soup = fetch_soup(gymurl)
            except requests.RequestException as err:
                # One unreachable page should not abort the whole crawl.
                print('Skipping {}: {}'.format(gymurl, err))
            else:
                gymwriter.writerow(gym_row(gymurl, soup))
            # Pause between requests to go easy on the server.
            time.sleep(CRAWL_DELAY)


if __name__ == '__main__':
    main()
When I run this, I get:
$ more gyms.csv
https://www.planetfitness.com/gyms/albertville-al,5850 US Hwy 431,Albertville,AL,35950,United States,"Monday-Friday 6am-9pm, Sat
urday-Sunday 7am-7pm"
https://www.planetfitness.com/gyms/alexander-city-al,987 Market Place,Alexander City,AL,35010,United States,Convenient hours whe
n we reopen
https://www.planetfitness.com/gyms/bessemer-al,528 W Town Plaza,Bessemer,AL,35020,United States,Convenient hours when we reopen
https://www.planetfitness.com/gyms/birmingham-crestline-al,4500 Montevallo Rd,Birmingham,AL,35210,United States,Convenient hours
when we reopen
.
.
.