I'm trying to read an EML file and then pull all URLs within it.
I've got two methods: body_to_text() which gets the body from the EML, with either BytesParser or Soup; and find_links() which takes the body and uses a regex to find the URLs.
I've got it working for most samples I've tried, however when using Soup to parse the non-multipart files, I run into a problem when the sample contains end of line equals signs.
def body_to_text():
with open("email.eml", "rb") as email_file:
email_message = email.message_from_binary_file(email_file)
if email_message.is_multipart():
with open(self.email, 'rb') as fp:
msg = BytesParser(policy=policy.default).parse(fp)
try:
body_text = msg.get_body(preferencelist=('plain')).get_content().strip()
except AttributeError:
print("No body found")
else:
body_text = body_text.replace("\n", "")
if body_text == "":
print("No body found")
else:
self.find_links(body_text)
else:
body_html = email_message.get_payload()
soup = BeautifulSoup(body_html, "lxml")
find_links(soup)
def find_links(scan_text):
WEB_URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
links = re.findall(WEB_URL_REGEX, str(scan_text))
links = list(dict.fromkeys(self.links))
print(f"{len(self.links)} links found")
print(links)
print(body_html) gives
<a href=3D"http://fawper.xyz/corruptly/676197486/trout/gen= eralizing/1683814388/upgather/disjoin" style=3D"-webkit-text-size-adjust:no= ne;text-decoration:none;"> <font style=3D"-webkit-text-size-adjust:none;fon= t-size:15px;
And print(soup) gives
href='3D"http://fawper.xyz/corruptly/676197486/trout/gen=' ne="" style='3D"-webkit-text-size-adjust:no='> <font style='3D"-webkit-text-size-adjust:none;fon=' t-size:15px=""
So then find_links outputs:
'http://fawper.xyz/corruptly/676197486/trout/gen='
When I want it to output:
'http://fawper.xyz/corruptly/676197486/trout/generalizing/1683814388/upgather/disjoin'
I've tried using html.parser and html5lib in place of lxml, but that didn't solve it. Could it be the encoding of the specific email that I'm parsing?
Swapping the soup block with a part of lastchancexi's answer, which used the Email module to get its payload based on the content type, gave me the desired output.
def body_to_text(self):
text = ""
html = ""
with open(self.email, "rb") as email_file:
email_message = email.message_from_binary_file(email_file)
if not email_message.is_multipart():
content_type = email_message.get_content_type()
if content_type == "text/plain":
text += str(email_message.get_payload(decode=True))
self.find_urls(text)
elif content_type == "text/html":
html += str(email_message.get_payload(decode=True))
self.find_urls(html)
else:
with open(self.email, 'rb') as fp:
msg = BytesParser(policy=policy.default).parse(fp)
try:
body_text = msg.get_body(preferencelist=('plain')).get_content().strip()
except AttributeError:
print("No body found")
else:
body_text = body_text.replace("\n", "")
if body_text == "":
print("No body found")
else:
self.find_urls(body_text)