I am working on scraping data from several PDF infographics containing Amtrak ridership data. I want to collect the yearly ridership numbers and the address of each station in the US.
Here is my code for one station:
import re

import fitz
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Step 1: Download the station's PDF and save it locally
url_abe = 'https://www.railpassengers.org/site/assets/files/1679/abe.pdf'
# A timeout keeps the script from hanging forever on an unresponsive server.
page = requests.get(url_abe, timeout=30)
# Stop on an HTTP error (e.g. 404) instead of writing an error page to disk
# as if it were the PDF.
page.raise_for_status()
# Save the PDF file locally
pdf_path = 'abe.pdf'
with open(pdf_path, 'wb') as file:
    file.write(page.content)
# Step 2: Extract text from the PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        One string made of each page's ``get_text()`` output, joined with no
        separator between pages.
    """
    # The context manager closes the document even if extraction raises;
    # without it the file handle stays open for the life of the process.
    with fitz.open(pdf_path) as document:
        # Iterating a document yields its pages in order.
        return ''.join(page.get_text() for page in document)
# Get the extracted text: the text of all pages of the downloaded PDF, as one string
pdf_text = extract_text_from_pdf(pdf_path)
How can I then parse this text to get a pandas DataFrame that looks like the following?
2016 2017 2018 2019 2020 2021 2022 Address
37161 37045 37867 39108 19743 14180 34040 18 E Bel Air Ave Aberdeen, MD 21001-3701
I figured it out:
def extract_all_ridership_numbers(text, pattern=r'\b\d{2},\d{3}\b'):
    r"""Return every non-overlapping match of ``pattern`` in ``text``, in order.

    Args:
        text: Text to search, e.g. the text extracted from a PDF.
        pattern: Regular expression (string or compiled pattern) describing
            one number. The default matches exactly two digits, a comma and
            three digits (``10,000`` to ``99,999``), so values with fewer or
            more digits are not returned. Pass e.g.
            ``r'\b\d{1,3}(?:,\d{3})+\b'`` to accept any comma-grouped number.
            Use non-capturing groups: with capturing groups ``re.findall``
            returns the groups instead of the whole match.

    Returns:
        A list of the matched strings exactly as they appear in ``text``
        (thousands separators are kept); an empty list if nothing matches.
    """
    return re.findall(pattern, text)
# Extract all ridership numbers
ridership_numbers = extract_all_ridership_numbers(pdf_text)
# Print the extracted ridership numbers
print('Extracted Ridership Numbers:', ridership_numbers)

# The first matches in the text are taken to be the values for these years,
# in this order.
years = [2016, 2017, 2018, 2019, 2020, 2021, 2022]
# Fail here with a specific message when the PDF yielded fewer values than
# years, rather than letting the 'Year' column assignment below go wrong.
if len(ridership_numbers) < len(years):
    raise ValueError(
        f'Expected at least {len(years)} ridership numbers, '
        f'found {len(ridership_numbers)}'
    )
# Keep only the first len(years) matches; any later matches are ignored.
# The values stay as the matched strings (e.g. with thousands separators).
df = pd.DataFrame(ridership_numbers[:len(years)], columns=['Ridership Numbers'])
df['Year'] = years