I wrote a script to extract published dates from news articles. I have all the urls to these articles in a text file (one url per line). The goal is to group the articles by date (one file for each day and it has all news stories published in that day). The script runs fine but it takes a very long time (sometimes weeks). There are about 300k news articles' urls in each text file. the script eventually stops and gives this error
Traceback (most recent call last):
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\urllib3\connectionpool.py", line 703, in urlopen
httplib_response = self._make_request(
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\urllib3\connectionpool.py", line 386, in _make_request
self._validate_conn(conn)
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\urllib3\connectionpool.py", line 1042, in _validate_conn
conn.connect()
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\urllib3\connection.py", line 414, in connect
self.sock = ssl_wrap_socket(
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\urllib3\util\ssl_.py", line 449, in ssl_wrap_socket
ssl_sock = _ssl_wrap_socket_impl(
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\urllib3\util\ssl_.py", line 493, in _ssl_wrap_socket_impl
return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2288.0_x64__qbz5n2kfra8p0\lib\ssl.py", line 513, in wrap_socket
return self.sslsocket_class._create(
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2288.0_x64__qbz5n2kfra8p0\lib\ssl.py", line 1071, in _create
self.do_handshake()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2288.0_x64__qbz5n2kfra8p0\lib\ssl.py", line 1342, in do_handshake
self._sslobj.do_handshake()
ConnectionAbortedError: [WinError 10053] An established connection was aborted by the software in your host machine
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\requests\adapters.py", line 489, in send
resp = conn.urlopen(
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\urllib3\connectionpool.py", line 787, in urlopen
retries = retries.increment(
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\urllib3\util\retry.py", line 550, in increment
raise six.reraise(type(error), error, _stacktrace)
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\urllib3\packages\six.py", line 769, in reraise
raise value.with_traceback(tb)
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\urllib3\connectionpool.py", line 703, in urlopen
httplib_response = self._make_request(
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\urllib3\connectionpool.py", line 386, in _make_request
self._validate_conn(conn)
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\urllib3\connectionpool.py", line 1042, in _validate_conn
conn.connect()
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\urllib3\connection.py", line 414, in connect
self.sock = ssl_wrap_socket(
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\urllib3\util\ssl_.py", line 449, in ssl_wrap_socket
ssl_sock = _ssl_wrap_socket_impl(
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\urllib3\util\ssl_.py", line 493, in _ssl_wrap_socket_impl
return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2288.0_x64__qbz5n2kfra8p0\lib\ssl.py", line 513, in wrap_socket
return self.sslsocket_class._create(
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2288.0_x64__qbz5n2kfra8p0\lib\ssl.py", line 1071, in _create
self.do_handshake()
File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.2288.0_x64__qbz5n2kfra8p0\lib\ssl.py", line 1342, in do_handshake
self._sslobj.do_handshake()
urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionAbortedError(10053, 'An established connection was aborted by the software in your host machine', None, 10053, None))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\hhallak\Desktop\split_by_date.py", line 65, in <module>
split(links)
File "C:\Users\hhallak\Desktop\split_by_date.py", line 25, in split
with requests.get(link, stream=True) as response:
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\requests\api.py", line 73, in get
return request("get", url, params=params, **kwargs)
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\requests\api.py", line 59, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\requests\sessions.py", line 587, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\requests\sessions.py", line 701, in send
r = adapter.send(request, **kwargs)
File "C:\Users\hhallak\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\requests\adapters.py", line 547, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionAbortedError(10053, 'An established connection was aborted by the software in your host machine', None, 10053, None))
This is the code I am running:
import os
import sys
from newspaper import Article
import requests
import json
def split(links):
exists = os.path.exists("output")
if not exists:
# Create a new directory because it does not exist
os.makedirs("output")
exists = os.path.exists("output_redo")
if not exists:
# Create a new directory because it does not exist
os.makedirs("output_redo")
for link in links:
with requests.get(link, stream=True) as response:
response = requests.get(link)
if response.status_code == 200:
final_link = link
else:
final_link = response.url
try:
story = Article(final_link)
story.download()
story.parse()
date_time = str(story.publish_date)
split_date = date_time.split()
date = split_date[0]
with open("output/" + date + ".txt", "a", encoding = 'utf-8') as output_file:
output_file.write(link + "\n")
except:
print("The script was not able to extract published date. Moving the url to be crawled later.")
print("link: ", link)
print("status code: ", response.status_code)
print("final link: ", final_link)
with open("output_redo/" + "links_to_redo" + ".txt", "a", encoding = 'utf-8') as output_redo:
output_redo.write(link + "\n")
continue
if __name__ == "__main__":
if len(sys.argv) != 2:
print ("Usage: Python split_by_date.py <file_name>")
print ("e.g: python split_by_date.py input_file.txt")
sys.exit()
else:
file_name = sys.argv[1]
with open(file_name, "r", encoding = 'utf-8') as input_file:
input_data = input_file.read()
links = input_data.split("\n")
del links[-1]
split(links)
I figured it out by disabling the firewall. I'm not sure if this is it because I don't get this error for a small input file (under 10 MB of urls in text format). Another issue appeared which is "Connection aborted, [Errno 10054] An existing connection was forcibly closed by the remote host. I think this one was because I was making too many requests to the server. I have solved this issue so far by including time.sleep(5) to wait 5 seconds between requests. The reason I did not use threading is that with threading, I will be making even more requests and will sure get blocked by the server.