I have a simple Flask app that fetches articles from a handful of RSS feeds. This is the code:
import requests
import xml.etree.ElementTree as ET
from dateutil import parser
import re

from feeds_config import FEEDS


def extract_image_from_content(content):
    """Extract the first image URL from the content using regex."""
    match = re.search(r'<img[^>]+src="([^">]+)"', content)
    return match.group(1) if match else None


def fetch_articles():
    """
    Fetch articles from the feeds listed in the FEEDS configuration.
    For each feed, parse the RSS feed and extract the relevant article details.
    """
    articles = []
    for feed in FEEDS:
        response = requests.get(feed['url'])  # Fetch the RSS feed
        if response.status_code == 200:
            root = ET.fromstring(response.content)  # Parse the XML content
            ns = feed.get('image_ns', {})  # Namespaces for image lookups
            content_ns = feed.get('content_ns', {})  # Namespaces for content lookups

            # Temporarily store this feed's articles
            feed_articles = []

            # Iterate over each item (article) in the feed
            for item in root.findall(".//item"):
                title = item.find("title").text  # Article title
                link = item.find("link").text  # Article link
                pub_date = item.find("pubDate").text  # Publication date
                timestamp = parser.parse(pub_date)  # Parse the date into a datetime object

                # Extract the image URL from <enclosure> or another image tag if available
                image = item.find(feed.get('image_xpath', '.'), namespaces=ns)
                image_url = image.get("url") if image is not None else None

                # If no image was found, attempt to extract one from the content
                if not image_url and feed.get('content_xpath'):
                    content = item.find(feed['content_xpath'], namespaces=content_ns)
                    content_text = content.text if content is not None else ""
                    image_url = extract_image_from_content(content_text)

                # Append the article details to the feed_articles list
                feed_articles.append({
                    "title": title,
                    "link": link,
                    "timestamp": timestamp,
                    "source": feed['source'],
                    "image": image_url,
                    "source_url": feed['source_url']
                })

            # Drop the duplicate if the first two items share the same title
            if len(feed_articles) > 1 and feed_articles[0]['title'] == feed_articles[1]['title']:
                feed_articles.pop(0)

            # Add the remaining articles to the main list
            articles.extend(feed_articles)
    return articles
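(For context, the Flask side is just a thin wrapper around fetch_articles. A minimal sketch of how that wiring typically looks; the route and template names below are illustrative, not taken from the actual app:

from flask import Flask, render_template

app = Flask(__name__)

@app.route("/")
def index():
    # Fetch every feed, sort newest first, and hand the list to a template
    articles = sorted(fetch_articles(), key=lambda a: a["timestamp"], reverse=True)
    return render_template("index.html", articles=articles)
)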
I'm running it against this FEEDS configuration in feeds_config.py:
FEEDS = [
    {
        'url': 'https://hedgehogreview.com/web-features/feed',
        'source': 'Hedgehog Review',
        'source_url': 'https://hedgehogreview.com/',  # Extracted URL
        'image_xpath': './enclosure',
        'image_ns': {},
        'content_xpath': './content:encoded',
        'content_ns': {'content': 'http://purl.org/rss/1.0/modules/content/'}
    },
    {
        'url': 'https://mcrawford.substack.com/feed',
        'source': 'M.B. Crawford Substack',
        'source_url': 'https://mcrawford.substack.com',  # Extracted URL
        'image_xpath': './enclosure',
        'image_ns': {}
    },
    {
        'url': 'https://mattdinan.substack.com/feed',
        'source': 'Matt Dinan Substack',
        'source_url': 'https://mattdinan.substack.com',  # Extracted URL
        'image_xpath': './enclosure',
        'image_ns': {}
    }
]
When I run it locally, all three feeds load. When I run it on a free-tier Azure App Service, only Hedgehog Review loads. Is there a reason the code can't pull RSS feeds from Substack when hosted in the cloud?
I've checked that inbound and outbound traffic are allowed (of course they are, since I can reach the site and one of the feeds loads). I've verified that all the dependencies are in requirements.txt and that the deployment succeeded.
Substack is blocking Azure IPs. I discovered this recently while running an instance of FreshRSS in an Azure VM: my hundreds of feeds load fine -- except for any feed coming from Substack, which returns a 403. Others report the same thing: https://www.reddit.com/r/Substack/comments/1czssm4/substack_is_blocking_my_ip/
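You can confirm this from inside the App Service: the loop in fetch_articles silently skips any response that isn't a 200, so the 403 never surfaces. A rough sketch of a fetch helper that logs the status (the helper name and User-Agent string are mine, just for illustration) -- sending a browser-like User-Agent is worth ruling out, but since the block appears to be on the IP range it's unlikely to help:

import logging
import requests

log = logging.getLogger(__name__)

def fetch_feed(url):
    """Fetch a feed and log anything that isn't a 200 so blocks are visible."""
    # A browser-like User-Agent rules out header-based filtering, but if the
    # block is on the Azure IP range it won't make a difference.
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MyRSSReader/1.0)"}
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 200:
        log.warning("Feed %s returned HTTP %s", url, response.status_code)
    return response

With that in place, the 403s from the Substack feeds should show up in the App Service log stream (with application logging enabled), while the Hedgehog Review feed keeps returning 200.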