I have a simple Flask app that fetches articles from a handful of RSS feeds. This is the code:
import logging
import re
import xml.etree.ElementTree as ET

import requests
from dateutil import parser

from feeds_config import FEEDS
def extract_image_from_content(content):
    """Return the URL of the first ``<img>`` tag found in ``content``.

    Args:
        content: HTML text to scan. ``None`` or an empty string is treated
            as "no content" instead of raising.

    Returns:
        The value of the first ``<img>`` tag's ``src`` attribute, or ``None``
        when there is no content or no ``<img>`` tag with a ``src``.
    """
    if not content:
        return None
    # The lazy ``[^>]*?`` plus the required whitespace before ``src`` picks the
    # tag's real ``src`` attribute rather than a later ``data-src``. Either
    # quote style and any letter case are accepted; group 2 is the URL.
    match = re.search(
        r'<img\b[^>]*?\ssrc\s*=\s*(["\'])(.+?)\1',
        content,
        re.IGNORECASE,
    )
    return match.group(2) if match else None
_logger = logging.getLogger(__name__)

# Sent with every feed request.
# NOTE(review): some hosts appear to reject the default python-requests
# User-Agent, so an explicit one may help; it cannot get around blocking that
# is based on the caller's IP range -- confirm per host.
_REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; rss-fetcher/1.0)"}

# Seconds to wait on a feed, so one stalled host cannot hang the whole fetch.
_REQUEST_TIMEOUT = 10


def _child_text(item, tag):
    """Return the text of the first ``tag`` child of ``item``, or None if absent."""
    child = item.find(tag)
    return child.text if child is not None else None


def _find_image_url(item, feed):
    """Return the item's image URL from the configured element, else from its content."""
    image = item.find(feed.get('image_xpath', '.'), namespaces=feed.get('image_ns', {}))
    image_url = image.get("url") if image is not None else None
    if image_url or not feed.get('content_xpath'):
        return image_url
    content = item.find(feed['content_xpath'], namespaces=feed.get('content_ns', {}))
    # An element that exists but is empty has ``text`` set to None.
    content_text = (content.text or "") if content is not None else ""
    return extract_image_from_content(content_text)


def _parse_item(item, feed):
    """Build one article dict from an ``<item>``; return None if its date is unusable."""
    pub_date = _child_text(item, "pubDate")
    if not pub_date:
        _logger.warning("Skipping item without pubDate in feed %s", feed['url'])
        return None
    try:
        timestamp = parser.parse(pub_date)
    except (ValueError, OverflowError):
        _logger.warning("Skipping item with unparseable pubDate %r in feed %s",
                        pub_date, feed['url'])
        return None
    return {
        "title": _child_text(item, "title"),
        "link": _child_text(item, "link"),
        "timestamp": timestamp,
        "source": feed['source'],
        "image": _find_image_url(item, feed),
        "source_url": feed['source_url'],
    }


def fetch_articles():
    """Fetch and parse every feed listed in ``FEEDS``.

    Each feed is handled independently: a network error, a non-200 response
    or malformed XML is logged and that feed is skipped, so one failing feed
    does not prevent the others from loading. Items without a parseable
    ``pubDate`` are skipped as well.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``timestamp``
        (the parsed ``pubDate``), ``source``, ``image`` (URL or None) and
        ``source_url``, in feed order.
    """
    articles = []
    for feed in FEEDS:
        try:
            response = requests.get(
                feed['url'],
                headers=_REQUEST_HEADERS,
                timeout=_REQUEST_TIMEOUT,
            )
        except requests.RequestException:
            _logger.exception("Could not fetch feed %s", feed['url'])
            continue

        if response.status_code != 200:
            # Log the status so a blocked feed (e.g. HTTP 403) is visible
            # instead of silently producing no articles.
            _logger.warning("Feed %s returned HTTP %s; skipping",
                            feed['url'], response.status_code)
            continue

        try:
            root = ET.fromstring(response.content)
        except ET.ParseError:
            _logger.exception("Feed %s did not return well-formed XML", feed['url'])
            continue

        feed_articles = []
        for item in root.findall(".//item"):
            article = _parse_item(item, feed)
            if article is not None:
                feed_articles.append(article)

        # Remove duplicate if the first two items have the same title
        if len(feed_articles) > 1 and feed_articles[0]['title'] == feed_articles[1]['title']:
            feed_articles.pop(0)

        articles.extend(feed_articles)
    return articles
I'm running it against this feed configuration (a list of Python dicts):
# Feed definitions read by fetch_articles(). Keys per feed:
#   url           - address of the RSS feed to download
#   source        - display name copied onto each article
#   source_url    - home page of the source, copied onto each article
#   image_xpath   - path, relative to <item>, of the element whose "url"
#                   attribute holds the image
#   image_ns      - namespace prefix map used with image_xpath
#   content_xpath - (optional) path of the element whose text is searched
#                   for an <img> tag when no image URL was found
#   content_ns    - namespace prefix map used with content_xpath
FEEDS = [
    {
        'url': 'https://hedgehogreview.com/web-features/feed',
        'source': 'Hedgehog Review',
        'source_url': 'https://hedgehogreview.com/',  # Extracted URL
        'image_xpath': './enclosure',
        'image_ns': {},
        'content_xpath': './content:encoded',
        # The URI must match the feed's xmlns:content declaration exactly
        # (note the double slash), or './content:encoded' never matches.
        'content_ns': {'content':
                       'http://purl.org/rss/1.0/modules/content/'},
    },
    {
        'url': 'https://mcrawford.substack.com/feed',
        'source': 'M.B. Crawford Substack',
        'source_url': 'https://mcrawford.substack.com',  # Extracted URL
        'image_xpath': './enclosure',
        'image_ns': {},
    },
    {
        'url': 'https://mattdinan.substack.com/feed',
        'source': 'Matt Dinan Substack',
        'source_url': 'https://mattdinan.substack.com',  # Extracted URL
        'image_xpath': './enclosure',
        'image_ns': {},
    },
]
When I run it locally, they all load. When I run it in a free-tier Azure App Service, only Hedgehog Review loads. Is there a reason the code can't pull RSS feeds from Substack when hosted in the cloud?
I've checked that inbound and outbound traffic are allowed (of course they are, since I can access the site and some of the objects are loading). I've verified that all the dependencies are in dependencies.txt and that the deployment worked successfully.
The other two URLs are blocking (refusing to authorize) requests that come from Azure.
I tried your code and ran into the same issue. Locally, all three URLs return data, but in Azure only 'https://hedgehogreview.com/web-features/feed'
returns a response.
No errors are logged during the run.
I used curl to check the response from each URL in both environments.
I don't think it's possible to resolve the error unless those sites allow requests from the Azure environment.
Output
Locally: all three URLs respond with 200 OK.
Azure: the other two URLs respond with 403.