
Python - How to Pull URLs From EML Files with BeautifulSoup


I'm trying to read an EML file and then pull all the URLs out of it.

I've got two methods: body_to_text(), which extracts the body from the EML using either BytesParser or BeautifulSoup, and find_links(), which takes that body and runs a regex over it to find the URLs. Both live on a class, hence the self references; see the wrapper sketch after the second method.

I've got it working for most samples I've tried; however, when using BeautifulSoup to parse the non-multipart files, I run into a problem whenever the sample contains end-of-line equals signs.

import email
import re
from email import policy
from email.parser import BytesParser
from bs4 import BeautifulSoup

def body_to_text(self):
        with open(self.email, "rb") as email_file:
            email_message = email.message_from_binary_file(email_file)

        if email_message.is_multipart():
            # Multipart: re-parse with the modern EmailMessage API and
            # pull out the plain-text body part
            with open(self.email, "rb") as fp:
                msg = BytesParser(policy=policy.default).parse(fp)

            try:
                body_text = msg.get_body(preferencelist=('plain',)).get_content().strip()
            except AttributeError:
                print("No body found")
            else:
                body_text = body_text.replace("\n", "")

                if body_text == "":
                    print("No body found")
                else:
                    self.find_links(body_text)

        else:
            # Non-multipart: feed the raw payload to BeautifulSoup
            # (this is where the problem shows up)
            body_html = email_message.get_payload()
            soup = BeautifulSoup(body_html, "lxml")
            self.find_links(soup)

def find_links(self, scan_text):
        WEB_URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
        links = re.findall(WEB_URL_REGEX, str(scan_text))

        # dict.fromkeys() drops duplicates while preserving order
        links = list(dict.fromkeys(links))

        print(f"{len(links)} links found")
        print(links)
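
Both methods reference self, so to run them as shown they need to hang off a class that stores the path to the .eml file. Here is a minimal sketch of such a wrapper; the class name EmailLinkExtractor and its constructor are my own illustration, assuming the two functions above are defined at module level:

    class EmailLinkExtractor:
        # Hypothetical wrapper class, not from the original code.
        def __init__(self, eml_path):
            self.email = eml_path  # path read by body_to_text()

    # Bind the module-level functions above as methods of the class.
    EmailLinkExtractor.body_to_text = body_to_text
    EmailLinkExtractor.find_links = find_links

    EmailLinkExtractor("email.eml").body_to_text()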

print(body_html) gives

<a href=3D"http://fawper.xyz/corruptly/676197486/trout/gen=
eralizing/1683814388/upgather/disjoin" style=3D"-webkit-text-size-adjust:no=
ne;text-decoration:none;"> <font style=3D"-webkit-text-size-adjust:none;fon=
t-size:15px;

And print(soup) gives

href='3D"http://fawper.xyz/corruptly/676197486/trout/gen=' ne="" style='3D"-webkit-text-size-adjust:no='> <font style='3D"-webkit-text-size-adjust:none;fon=' t-size:15px=""

So then find_links outputs:

'http://fawper.xyz/corruptly/676197486/trout/gen='

When I want it to output:

'http://fawper.xyz/corruptly/676197486/trout/generalizing/1683814388/upgather/disjoin'

I've tried using html.parser and html5lib in place of lxml, but that didn't solve it. Could it be the encoding of the specific email that I'm parsing?


Solution

  • Swapping the soup block for part of lastchancexi's answer, which uses the email module to decode the payload based on its content type, gave me the desired output (see the quopri sketch after the code for why the decoding matters).

    def body_to_text(self):
            text = ""
            html = ""
            with open(self.email, "rb") as email_file:
                email_message = email.message_from_binary_file(email_file)

            if not email_message.is_multipart():
                content_type = email_message.get_content_type()

                # get_payload(decode=True) undoes the Content-Transfer-Encoding
                # and returns bytes, so decode those to a str before scanning
                if content_type == "text/plain":
                    text += email_message.get_payload(decode=True).decode(errors="replace")
                    self.find_links(text)
                elif content_type == "text/html":
                    html += email_message.get_payload(decode=True).decode(errors="replace")
                    self.find_links(html)
            else:
                with open(self.email, "rb") as fp:
                    msg = BytesParser(policy=policy.default).parse(fp)

                try:
                    body_text = msg.get_body(preferencelist=('plain',)).get_content().strip()
                except AttributeError:
                    print("No body found")
                else:
                    body_text = body_text.replace("\n", "")

                    if body_text == "":
                        print("No body found")
                    else:
                        self.find_links(body_text)
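
Why this works: the = artifacts in the raw body are quoted-printable encoding. =3D encodes a literal "=" and a trailing "=" marks a soft line break, and get_payload(decode=True) decodes the part's Content-Transfer-Encoding, which removes both. A standalone sketch with the standard-library quopri module, applied to the href fragment from the question (trimmed to just the link), shows the same decoding:

    import quopri

    # The raw quoted-printable fragment, as printed by print(body_html)
    raw = (b'<a href=3D"http://fawper.xyz/corruptly/676197486/trout/gen=\n'
           b'eralizing/1683814388/upgather/disjoin">')

    print(quopri.decodestring(raw).decode("ascii"))
    # <a href="http://fawper.xyz/corruptly/676197486/trout/generalizing/1683814388/upgather/disjoin">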