python · web-scraping · newspaper3k

Newspaper3k: filter out bad URLs while extracting


With some help ;) I have managed to scrape titles and content from CNN news website and put this in a .csv file.

Now the list of URLs (which was extracted with another piece of code) contains some bad URLs. The code for that is really simple, as it just scans through the website and returns all URLs; therefore the list includes some bad ones (e.g. http://cnn.com/date/2021-10-17). Rather than searching the list and removing those bad URLs manually, I was wondering whether this could be resolved by changing my code so that it skips a bad URL and continues with the next one, and so on.

example code:

import csv
from newspaper import Config
from newspaper import Article
from os.path import exists

# Realistic browser user-agent string; sent with every request so the
# target site treats the scraper like an ordinary browser.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

# newspaper3k configuration shared by all Article objects below.
config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10  # seconds to wait before a download attempt gives up

# Hard-coded sample; the second entry is a known "bad" URL used to
# demonstrate the failure. Normally these come from a .csv file.
urls = ['https://www.cnn.com/2021/10/25/tech/facebook-papers/index.html', 'http://cnn.com/date/2021-10-17', 'https://www.cnn.com/entertainment/live-news/rust-shooting-alec-baldwin-10-25-21/h_257c62772a2b69cb37db397592971b58']
# the above normally would be where I refer to the .csv file with URLs
# Download and parse each article, then append its title and text to the
# results CSV. The header row is written only when the file is created.
# NOTE: no error handling yet — a bad URL aborts the whole run, which is
# exactly the problem the question is about.
for url in urls:
    article = Article(url, config=config)
    article.download()
    article.parse()
    article_meta_data = article.meta_data

    headers = ['article title', 'article text']
    row = {'article title': article.title,
           'article text': article.text}

    # Fresh file -> write mode plus a header; existing file -> append.
    need_header = not exists('cnn_extraction_results.csv')
    mode = 'w' if need_header else 'a'
    with open('cnn_extraction_results.csv', mode, newline='') as file:
        writer = csv.DictWriter(file, delimiter=',', lineterminator='\n', fieldnames=headers)
        if need_header:
            writer.writeheader()
        writer.writerow(row)

Solution

  • Try this:

    import csv
    from os.path import exists
    from newspaper import Config
    from newspaper import Article
    from newspaper import ArticleException
    
    # Realistic browser user-agent string sent with every request.
    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
    
    # newspaper3k configuration shared by all Article objects below.
    config = Config()
    config.browser_user_agent = USER_AGENT
    config.request_timeout = 10  # seconds to wait before a download gives up
    
    # Sample input; the middle entry is a deliberately bad URL that should
    # be skipped rather than crash the run.
    urls = ['https://www.cnn.com/2021/10/25/tech/facebook-papers/index.html',
            'http://cnn.com/date/2021-10-17',
            'https://www.cnn.com/entertainment/live-news/rust-shooting-alec-baldwin-10-25-21/h_257c62772a2b69cb37db397592971b58']
    
    # Scrape every URL, appending one CSV row (title, text) per article.
    # Bad URLs raise newspaper.ArticleException during download()/parse();
    # those are reported and skipped so the loop keeps going.
    headers = ['article title', 'article text']
    # Check for the header once, up front, instead of re-testing the file
    # and re-opening it on every iteration of the loop.
    need_header = not exists('cnn_extraction_results.csv')
    # Append mode creates the file if it does not exist yet, so a single
    # open() covers both the "new file" and "existing file" cases.
    with open('cnn_extraction_results.csv', 'a', newline='') as file:
        writer = csv.DictWriter(file, delimiter=',', lineterminator='\n', fieldnames=headers)
        if need_header:
            writer.writeheader()
        for url in urls:
            try:
                # Keep the try body minimal: only the network/parse steps
                # can raise ArticleException. A CSV write error should not
                # be misreported as a download failure.
                article = Article(url, config=config)
                article.download()
                article.parse()
            except ArticleException:
                print('***FAILED TO DOWNLOAD***', url)
                continue
            article_meta_data = article.meta_data
            writer.writerow({'article title': article.title,
                             'article text': article.text})