In Python, suppose you have a loop and on each iteration you want to write to a file (pickling, in my case), overwriting whatever data is already there. One option is to open the file before the loop, keep it open, and truncate it on each iteration to erase the previous data before writing the new data:
import pickle

with open(filename, 'wb') as file:
    for blah in blahs:
        file.truncate(0)
        file.seek(0)
        pickle.dump(blah, file)
and another is to just re-open the file on each iteration, since opening it in 'wb' mode automatically truncates it:
import pickle

for blah in blahs:
    with open(filename, 'wb') as file:
        pickle.dump(blah, file)
Which is better (in terms of performance/speed, handling of system resources, etc.)? Is there a better way to overwrite data in an already-open file than using file.truncate() and file.seek() as above?
I'm aware a similar question has been asked (Is it better to open/close a file every time vs keeping it open until the process is finished?), but there it appears to be about appending on each iteration rather than overwriting, so I'm wondering whether the truncating etc. in the keep-it-open approach causes any significant performance degradation that would tip the scales.
I don't like guessing, so I profiled the two approaches:
import pickle
import tempfile
from random import choices
from string import ascii_lowercase, ascii_uppercase, digits
from pathlib import Path

from performance_measurement import run_performance_comparison


class Bla:
    def __init__(self):
        population = ascii_uppercase + digits + ascii_lowercase
        self._content = str.join("", choices(population, k=50))


def truncate_approach(blahs: list[Bla], filename: str):
    # Keep a single handle open and wipe the contents before each dump.
    with open(filename, "wb") as file:
        for blah in blahs:
            file.truncate(0)
            file.seek(0)
            pickle.dump(blah, file)


def reopen_approach(blahs: list[Bla], filename: str):
    # Re-open in "wb" mode on every iteration; the open itself truncates.
    for blah in blahs:
        with open(filename, "wb") as file:
            pickle.dump(blah, file)


def setup(N):
    return [[Bla() for i in range(N)], Path(tempfile.NamedTemporaryFile().name)]


run_performance_comparison(
    approaches=[truncate_approach, reopen_approach],
    data_size=[10, 20, 30, 100, 200, 300, 1000, 2000, 3000],
    setup=setup,
    number_of_repetitions=10,
)
truncate_approach is slightly faster. I assume it's because we interact with the disk less: we sometimes get to truncate the content and reset the write buffer before we ever have to touch the disk.
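If you want to double-check where the extra time goes, one rough option (my own addition, not part of the benchmark above; it reuses Bla, truncate_approach and reopen_approach from it) is to run both approaches once under cProfile and compare how much cumulative time each spends in the built-in open() versus truncate(), seek() and pickle.dump():

import cProfile
import tempfile

# Assumes Bla, truncate_approach and reopen_approach from the benchmark above
# are defined in this same script.
blahs = [Bla() for _ in range(1000)]
filename = tempfile.NamedTemporaryFile().name

# Sorting by cumulative time shows which calls dominate each approach.
cProfile.run("truncate_approach(blahs, filename)", sort="cumulative")
cProfile.run("reopen_approach(blahs, filename)", sort="cumulative")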
Profiling code:
import timeit
from functools import partial
import matplotlib.pyplot as plt
from typing import List, Dict, Callable
from contextlib import contextmanager
import matplotlib.transforms as mtransforms
import matplotlib.ticker as ticker
import numpy as np
@contextmanager
def data_provider(data_size, setup=lambda N: [N], teardown=lambda *data: None):
    data = setup(data_size)
    yield data
    teardown(*data)
def run_performance_comparison(approaches: List[Callable],
                               data_size: List[int],
                               *,
                               setup=lambda N: [N],
                               teardown=lambda *N: None,
                               number_of_repetitions=5,
                               title='Performance Comparison',
                               data_name='N',
                               yscale='log',
                               xscale='log'):
    approach_times: Dict[Callable, List[float]] = {approach: [] for approach in approaches}

    for N in data_size:
        with data_provider(N, setup, teardown) as data:
            print(f'Running performance comparison for {data_name}={N}')
            for approach in approaches:
                function = partial(approach, *data)
                approach_time = min(timeit.Timer(function).repeat(repeat=number_of_repetitions, number=1))
                approach_times[approach].append(approach_time)

    for approach in approaches:
        plt.plot(data_size, approach_times[approach], label=approach.__name__)

    plt.yscale(yscale)
    plt.xscale(xscale)
    plt.xlabel(data_name)
    plt.ylabel('Execution Time (seconds)')
    plt.title(title)
    plt.legend()
    plt.show()