Tags: python-3.x, metadata, email-attachments, eml

How to parse an EML format file and extract meta-data information


I have an EML file with some attachments. I want to read the text content of the EML file and extract the metadata, such as sender, from, cc, bcc, and subject. I also want to download the attachments. With the help of the code below I am only able to extract the information (text content) in the body of the email.

import email
from email import policy
from email.parser import BytesParser
import glob
# Collect every .eml file in the current working directory.
file_list = glob.glob('*.eml') # returns list of files
with open(file_list[2], 'rb') as fp:  # select a specific email file from the list
    msg = BytesParser(policy=policy.default).parse(fp)
# preferencelist must be a sequence of subtypes: ('plain') is just the string
# 'plain' and only worked through substring matching, so pass a real tuple.
body = msg.get_body(preferencelist=('plain',))
# get_body() returns None when the message has no text/plain body part;
# fall back to an empty string instead of failing on None.get_content().
text = body.get_content() if body is not None else ''
print(text)

There was a module named emaildata, available for Python 2, that did the job.

Extracting MetaData Information

import email
from emaildata.metadata import MetaData

# Legacy Python 2 example: parse the message from a file handle.
# NOTE(review): the handle returned by open() is never explicitly closed.
message = email.message_from_file(open('message.eml'))
# MetaData is provided by the third-party emaildata package imported above.
extractor = MetaData(message)
# Presumably a dict of the extracted header metadata -- verify against the
# emaildata documentation.
data = extractor.to_dict()
# Python 2 print statement: lists the available metadata keys.
print data.keys()

Extracting Attachment Information

import email
from emaildata.attachment import Attachment

# Legacy Python 2 example: parse the message, then save each attachment.
message = email.message_from_file(open('message.eml'))
# NOTE(review): the loop variable `message` rebinds the parsed message above;
# Attachment.extract() comes from the third-party emaildata package.
for content, filename, mimetype, message in Attachment.extract(message):
    print filename
    # NOTE(review): the file is opened in text mode ('w'); this presumably
    # relies on Python 2 str semantics for binary content -- confirm.
    with open(filename, 'w') as stream:
        stream.write(content)
    # If message is not None then it is an instance of email.message.Message
    if message:
        print "The file {0} is a message with attachments.".format(filename)

But this library is now deprecated. Is there any other library that could extract the meta-data and attachment related information?


Solution

  • Metadata information can be accessed using the code below in Python 3.x:

    from email import policy
    from email.parser import BytesParser
    # Parse the message from a binary file handle with the modern policy.
    with open(eml_file, 'rb') as eml_handle:
        msg = BytesParser(policy=policy.default).parse(eml_handle)

    # Print each header of interest next to its label.
    for label, header_name in (
        ('To:', 'to'),
        ('From:', 'from'),
        ('Subject:', 'subject'),
    ):
        print(label, msg[header_name])
    

    The remaining header names can be listed with msg.keys(); each value can then be read as msg[name].

    To download the attachments from an EML file, you can use the code below:

    import sys
    import os
    import os.path
    from collections import defaultdict
    from email.parser import Parser
    
    # Path of the .eml file to process (placeholder -- replace with a real path).
    eml_mail = 'your eml file'
    # Directory the attachments are written to (placeholder -- replace with a
    # real path); run() creates it when it does not exist yet.
    output_dir = 'mention the directory where you want the files to be download'
    
    def parse_message(filename):
        """
        Parse the EML file at ``filename`` and return the message object.

        The file is read as bytes so that the email package, not the locale's
        default text encoding, decides how headers and bodies are decoded.
        Opening the file in text mode fails (or silently corrupts data) for
        messages that contain 8-bit content in a different encoding.
        """
        # Function-scope import: the script's import block only brings in
        # ``Parser``.
        from email.parser import BytesParser

        with open(filename, 'rb') as f:
            return BytesParser().parse(f)
    
    def find_attachments(message):
        """
        Return a list of ``(params, part)`` tuples, one per attachment found.

        ``params`` is a dict of the Content-Disposition parameters of the part
        (for example ``{'filename': 'report.pdf'}``) and ``part`` is the
        message object of that attachment. Parts without a
        Content-Disposition header, or whose disposition is not
        ``attachment``, are skipped.

        Parameters are parsed by the email package rather than by splitting
        on ``;`` and ``=``, so quoted values containing those characters,
        trailing semicolons, bare attributes and RFC 2231 encoded values
        (``filename*=UTF-8''...``) are all handled.
        """
        # Function-scope import: keeps the function self-contained without
        # changing the script's import block.
        from email.utils import collapse_rfc2231_value

        found = []
        for part in message.walk():
            # None when the header is missing, otherwise the lower-cased
            # disposition type ('attachment', 'inline', ...).
            if part.get_content_disposition() != 'attachment':
                continue
            params = part.get_params(header='content-disposition') or []
            parsed = {}
            # The first entry is the disposition type itself; skip it.
            for key, val in params[1:]:
                if not key:
                    # Produced by a trailing ';' -- nothing to record.
                    continue
                # RFC 2231 values arrive as (charset, language, text) tuples.
                val = collapse_rfc2231_value(val)
                # Single quotes are not MIME quoting, but they were tolerated
                # before, so keep stripping them.
                if val.startswith("'"):
                    val = val.strip("'")
                parsed[key] = val
            found.append((parsed, part))
        return found
    
    def run(eml_filename, output_dir):
        """
        Extract every attachment of ``eml_filename`` into ``output_dir``.

        ``output_dir`` is created (including missing parent directories) when
        it does not exist. Each attachment is written under the base name of
        its Content-Disposition ``filename`` parameter; attachments without a
        usable name are saved as ``attachment-<n>``. Attachments whose payload
        cannot be decoded to bytes (for example multipart containers) are
        reported and skipped. Returns ``None``.
        """
        msg = parse_message(eml_filename)
        attachments = find_attachments(msg)
        print("Found {0} attachments...".format(len(attachments)))
        os.makedirs(output_dir, exist_ok=True)
        for index, (cdisp, part) in enumerate(attachments, start=1):
            raw_name = cdisp.get('filename', '')
            # The file name comes from the mail itself and is untrusted: keep
            # only its final component so that neither absolute paths nor
            # '../' sequences (with either slash style) can escape output_dir.
            safe_name = os.path.basename(raw_name.replace('\\', '/'))
            if safe_name in ('', '.', '..'):
                safe_name = 'attachment-{0}'.format(index)
            towrite = os.path.join(output_dir, safe_name)
            # get_payload(decode=True) returns None for multipart parts.
            data = part.get_payload(decode=True)
            if data is None:
                print("Skipping " + towrite + " (no decodable payload)")
                continue
            print("Writing " + towrite)
            with open(towrite, 'wb') as fp:
                fp.write(data)
    
    
    # Script entry point: extract the attachments of eml_mail into output_dir.
    run(eml_mail, output_dir)