python memory mbox

Python: how can I reduce memory consumption and make this code more efficient?


I have written this code (it works — I tried it on a small MBOX file). However, when I run it on a 2.9 GB MBOX file containing about 50,000 mails, memory consumption skyrockets and the computer becomes unusable. What is wrong with this code in terms of memory consumption, and is there a way to fix it, such as processing the mailbox incrementally instead of as a whole? The goal of this script is to produce a CSV file with the date as X and the count of messages received on that date as Y, in order to plot them and produce a statistical representation of the email. In the future, I am planning to expand this to read the email messages and produce chronologically ordered output as a PDF, so the mail needs to be sorted (which is where the memory consumption skyrockets).

    import mailbox
    from email.utils import parsedate
    from dateutil.parser import parse
    import itertools
    import plotly.plotly as py
    from plotly.graph_objs import *
    import plotly.tools as tls
    import csv
    from itertools import izip

    # Mailbox file to analyse.
    path = 'mail.mbox'
    # Open it as an mbox-format mailbox; the statements below iterate over it.
    mbox = mailbox.mbox(path)

    def extract_date(email):
        """Return a sortable time tuple for the message's ``Date`` header.

        email -- any object with a mapping-style ``get`` method, for example
                 a message yielded by a ``mailbox`` mailbox.

        Returns the time tuple produced by ``email.utils.parsedate``.  When
        the header is missing, empty or cannot be parsed, an empty tuple is
        returned instead, so such messages sort ahead of every dated message
        rather than breaking the comparison.
        """
        date = email.get('Date')
        if not date:
            # Do not hand None to parsedate(): it is not guaranteed to accept it.
            return ()
        # parsedate() signals an unparseable date with None; normalise that to
        # a tuple so every key returned here is comparable with the others.
        return parsedate(date) or ()

 # Sort the emails by the key extract_date() returns for each message.
 # NOTE: sorted() builds a list containing every message of the mailbox, so
 # the whole mailbox content is held in memory at once.
    sorted_mails = sorted(mbox, key=extract_date)
 # Re-assign the messages to keys 0..n-1 in sorted order, then write the
 # pending changes back to the mailbox file.
    mbox.update(enumerate(sorted_mails))
    mbox.flush()

 # Collect the date of every message: the Date header is parsed with dateutil
 # and only the part of its string form before the first space is kept
 # (presumably the YYYY-MM-DD portion).
    all_dates = []
 # Re-open the mailbox from disk.
    mbox = mailbox.mbox(path)
    for message in mbox:
        all_dates.append( str( parse( message['date'] ) ).split(' ')[0] )

 # Count the number of emails per date.
 # NOTE: groupby() only groups *consecutive* equal values, so this relies on
 # all_dates already being in date order.
    email_count = [(g[0], len(list(g[1]))) for g in itertools.groupby(all_dates)]
 # Bare expression with no effect in a script (presumably left over from an
 # interactive session); it raises IndexError when email_count is empty.
    email_count[0]

 # Split the (date, count) pairs into two parallel lists: x = dates, y = counts.
    x = []
    y = []
    for date, count in email_count:
        x.append(date)
        y.append(count)

 # Produce a CSV file with one (date, count) row per date, for plotting.
 # The file is opened in binary mode, as the Python 2 csv module expects.
    with open('data.csv', 'wb') as f:
        writer = csv.writer(f)
        writer.writerows(izip(x, y))
 # The string literal below is disabled code kept as a bare string; it is
 # evaluated as a string and has no other effect.
   """
   data = Data([x, y])
   plot_url = py.iplot(Data, filename='line-scatter' )
  """
 # Plot the counts per date as a plotly scatter trace.
    py.iplot( Data([ Scatter( x=x, y=y ) ]) )

Solution

  • I'm not very familiar with these libraries, but I think the main issue is that you're reading all the messages into memory with this line:

    # sorted() returns a list holding every message, so the whole mailbox is in memory at once
    sorted_mails = sorted(mbox, key=extract_date)
    

    What is the goal of this script? Do you really need to sort anything? If you just need to produce a CSV with counts per date, try this:

    import mailbox
    from email.utils import parsedate
    from dateutil.parser import parse
    import itertools
    import plotly.plotly as py
    from plotly.graph_objs import *
    import plotly.tools as tls
    import csv
    from itertools import izip
    
    path = 'mail.mbox'
    mbox = mailbox.mbox(path)

    # Map each calendar date ('YYYY-MM-DD') to the number of emails seen on
    # that date.  Only this dict of counters is accumulated; the messages
    # themselves are not collected into a list.
    date_counts = {}

    try:
        for message in mbox:
            raw_date = message['date']
            if not raw_date:
                # No Date header: there is nothing to parse, so skip the message.
                continue
            try:
                date = parse(raw_date).date().isoformat()
            except (ValueError, OverflowError):
                # Malformed Date header: leave this message out instead of
                # aborting the whole run.
                continue
            date_counts[date] = date_counts.get(date, 0) + 1
    finally:
        # Release the file handle held by the mailbox.
        mbox.close()

    # Binary mode is what the Python 2 csv module expects.
    with open('data.csv', 'wb') as f:
        writer = csv.writer(f)
        # ISO-formatted dates sort chronologically as plain strings, so the
        # rows come out in date order, ready for plotting.
        for date in sorted(date_counts):
            writer.writerow([date, date_counts[date]])