With some help ;) I have managed to scrape titles and content from the CNN news website and write them to a .csv file.
Now the list of URLs (which was extracted with a separate script) contains some bad URLs. That script is really simple: it just scans the website and returns every URL it finds, so the list includes some bad ones (e.g. http://cnn.com/date/2021-10-17). Rather than searching the list and removing those bad URLs manually, I was wondering whether this could be solved by changing my code so that it skips a bad URL and continues with the next one, and so on.
Example code:

import csv
from newspaper import Config
from newspaper import Article
from os.path import exists

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

urls = ['https://www.cnn.com/2021/10/25/tech/facebook-papers/index.html',
        'http://cnn.com/date/2021-10-17',
        'https://www.cnn.com/entertainment/live-news/rust-shooting-alec-baldwin-10-25-21/h_257c62772a2b69cb37db397592971b58']
# normally the list above is read from the .csv file with URLs; see the sketch after this snippet

for url in urls:
    article = Article(url, config=config)
    article.download()
    article.parse()
    article_meta_data = article.meta_data

    file_exists = exists('cnn_extraction_results.csv')
    if not file_exists:
        with open('cnn_extraction_results.csv', 'w', newline='') as file:
            headers = ['article title', 'article text']
            writer = csv.DictWriter(file, delimiter=',', lineterminator='\n', fieldnames=headers)
            writer.writeheader()
            writer.writerow({'article title': article.title,
                             'article text': article.text})
    else:
        with open('cnn_extraction_results.csv', 'a', newline='') as file:
            headers = ['article title', 'article text']
            writer = csv.DictWriter(file, delimiter=',', lineterminator='\n', fieldnames=headers)
            writer.writerow({'article title': article.title,
                             'article text': article.text})
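For reference, a minimal sketch of how the hard-coded list could instead be read from a .csv file; the filename cnn_urls.csv and the one-URL-per-row layout are assumptions, since that file isn't shown in the question:

import csv

# Sketch: read URLs from a .csv file instead of hard-coding them.
# Assumes a file named 'cnn_urls.csv' with one URL per row (both assumptions).
with open('cnn_urls.csv', newline='') as f:
    urls = [row[0].strip() for row in csv.reader(f) if row]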
Try this: wrap the download and parse steps in a try/except block and catch newspaper's ArticleException, so a bad URL is reported and the loop simply continues with the next one.
import csv
from os.path import exists
from newspaper import Config
from newspaper import Article
from newspaper import ArticleException

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

urls = ['https://www.cnn.com/2021/10/25/tech/facebook-papers/index.html',
        'http://cnn.com/date/2021-10-17',
        'https://www.cnn.com/entertainment/live-news/rust-shooting-alec-baldwin-10-25-21/h_257c62772a2b69cb37db397592971b58']

for url in urls:
    try:
        article = Article(url, config=config)
        article.download()
        article.parse()
        article_meta_data = article.meta_data

        file_exists = exists('cnn_extraction_results.csv')
        if not file_exists:
            # first run: create the file and write the header row
            with open('cnn_extraction_results.csv', 'w', newline='') as file:
                headers = ['article title', 'article text']
                writer = csv.DictWriter(file, delimiter=',', lineterminator='\n', fieldnames=headers)
                writer.writeheader()
                writer.writerow({'article title': article.title,
                                 'article text': article.text})
        else:
            # later runs: append without rewriting the header
            with open('cnn_extraction_results.csv', 'a', newline='') as file:
                headers = ['article title', 'article text']
                writer = csv.DictWriter(file, delimiter=',', lineterminator='\n', fieldnames=headers)
                writer.writerow({'article title': article.title,
                                 'article text': article.text})
    except ArticleException:
        # download or parse failed (e.g. a bad URL): report it and move on
        print('***FAILED TO DOWNLOAD***', url)
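A further cleanup, if you like: open the results file once before the loop and write the header only when the file is new, instead of re-opening it for every URL. A minimal sketch, assuming the same config and urls as above:

import csv
from os.path import exists
from newspaper import Article
from newspaper import ArticleException

# open the file once in append mode; write the header only if the file is new
write_header = not exists('cnn_extraction_results.csv')
with open('cnn_extraction_results.csv', 'a', newline='') as file:
    writer = csv.DictWriter(file, delimiter=',', lineterminator='\n',
                            fieldnames=['article title', 'article text'])
    if write_header:
        writer.writeheader()
    for url in urls:
        try:
            article = Article(url, config=config)
            article.download()
            article.parse()
            writer.writerow({'article title': article.title,
                             'article text': article.text})
        except ArticleException:
            print('***FAILED TO DOWNLOAD***', url)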
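As an aside, you could also pre-filter obviously bad URLs before downloading anything. The pattern below is an assumption about CNN's URL scheme (article URLs contain a /YYYY/MM/DD/ date path or /live-news/, while index pages such as /date/2021-10-17 do not), so treat it as a sketch rather than a guaranteed rule:

import re

# Assumed CNN URL scheme: keep URLs with a /YYYY/MM/DD/ path or /live-news/,
# drop index pages such as http://cnn.com/date/2021-10-17
ARTICLE_PATTERN = re.compile(r'/\d{4}/\d{2}/\d{2}/|/live-news/')
urls = [url for url in urls if ARTICLE_PATTERN.search(url)]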