import os

import numpy as np
import torch
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience.

    Call the instance once per validation pass with the current loss. When the
    loss improves by at least ``delta`` the supplied objects are checkpointed
    and the counter is reset; otherwise the counter is incremented and
    ``early_stop`` becomes True once it reaches ``patience``.

    The bookkeeping can be exported with ``state_dict()`` and restored with
    ``load_state_dict()`` so that an interrupted run resumes with the same
    counter and best score instead of starting over.
    """

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Prefix the checkpoints are written under; every entry of
                            ``models_dict`` is saved to ``<path>/<name>.pkl``, so
                            this is used as a directory.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, models_dict):
        """Record one validation result and update the stopping state.

        Args:
            val_loss: Validation loss of the current epoch (lower is better).
            models_dict (dict): Maps a file name (without extension) to the
                object that should be saved when the loss improves.
        """
        # Work with a score where "higher is better".
        score = -val_loss
        if self.best_score is None:
            # First observation: nothing to compare against yet.
            self.best_score = score
            self.save_checkpoint(val_loss, models_dict)
        elif score < self.best_score + self.delta:
            # Not an improvement of at least `delta`.
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, models_dict)
            self.counter = 0

    def save_checkpoint(self, val_loss, models_dict):
        '''Saves every entry of models_dict when validation loss decreases.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        for file_name, model in models_dict.items():
            torch.save(model, self.path + "/" + file_name + ".pkl")
        self.val_loss_min = val_loss

    def state_dict(self):
        """Return the mutable bookkeeping needed to resume early stopping.

        Returns:
            dict: ``counter``, ``best_score``, ``early_stop`` and
            ``val_loss_min`` as currently held by this instance.
        """
        return {
            'counter': self.counter,
            'best_score': self.best_score,
            'early_stop': self.early_stop,
            'val_loss_min': self.val_loss_min,
        }

    def load_state_dict(self, state):
        """Restore bookkeeping previously produced by ``state_dict()``.

        Keys missing from ``state`` leave the corresponding attribute untouched.

        Args:
            state (dict): Mapping with any of the keys returned by ``state_dict()``.
        """
        self.counter = state.get('counter', self.counter)
        self.best_score = state.get('best_score', self.best_score)
        self.early_stop = state.get('early_stop', self.early_stop)
        self.val_loss_min = state.get('val_loss_min', self.val_loss_min)
The code above is the EarlyStopping class I'm using. I'm trying to build an image segmentation model using UNet. The problem is that I cannot afford to keep my runtime running all day. Using the code below, I was able to save the model itself every epoch, but whenever I reset my runtime, my early stopping counter gets re-initialized to 1. After one epoch a pkl file is created, but I wasn't able to find out how to resume from that point. Is there a way that I can resume from the counter value where I stopped?
def save(ckpt_dir, net, optim, epoch, early_stopping=None):
    """Write the model and optimizer state for ``epoch`` into ``ckpt_dir``.

    The checkpoint is stored as ``<ckpt_dir>/model_epoch<epoch>.pth`` and holds
    the keys ``'net'`` and ``'optim'``. When ``early_stopping`` is given, its
    bookkeeping is stored as well (``'es_counter'``, ``'es_best_score'``,
    ``'es_val_loss_min'``) so that a restarted run can continue counting from
    where it stopped.

    Args:
        ckpt_dir (str): Directory for the checkpoints; created if missing.
        net: Model exposing ``state_dict()``.
        optim: Optimizer exposing ``state_dict()``.
        epoch (int): Epoch number embedded in the file name.
        early_stopping: Optional object with ``counter``, ``best_score`` and
            ``val_loss_min`` attributes. Default: None (nothing extra saved).
    """
    # exist_ok avoids the race between checking for and creating the directory.
    os.makedirs(ckpt_dir, exist_ok=True)

    checkpoint = {'net': net.state_dict(), 'optim': optim.state_dict()}
    if early_stopping is not None:
        best_score = early_stopping.best_score
        # Stored as plain Python numbers so the file does not depend on the
        # numeric type (tensor, NumPy scalar, float) the loss happened to have.
        checkpoint['es_counter'] = int(early_stopping.counter)
        checkpoint['es_best_score'] = None if best_score is None else float(best_score)
        checkpoint['es_val_loss_min'] = float(early_stopping.val_loss_min)

    torch.save(checkpoint, os.path.join(ckpt_dir, "model_epoch%d.pth" % epoch))
def _checkpoint_epoch(file_name):
    """Return the epoch encoded in a ``*epoch<N>.pth`` file name, or None if it does not match."""
    _, marker, tail = file_name.rpartition('epoch')
    if not marker or not tail.endswith('.pth'):
        return None
    digits = tail[:-len('.pth')]
    return int(digits) if digits.isdecimal() else None


def load(ckpt_dir, net, optim, early_stopping=None):
    """Restore the newest checkpoint found in ``ckpt_dir``.

    Only entries named ``*epoch<N>.pth`` are considered, so unrelated entries
    in the directory (for example a notebook's ``.ipynb_checkpoints`` folder)
    are ignored. The one with the highest epoch number is loaded into ``net``
    and ``optim`` in place.

    Args:
        ckpt_dir (str): Directory that holds the checkpoints.
        net: Model exposing ``load_state_dict()``.
        optim: Optimizer exposing ``load_state_dict()``.
        early_stopping: Optional object whose ``counter``, ``best_score`` and
            ``val_loss_min`` attributes are restored in place when the
            checkpoint contains ``'es_counter'``. Default: None.

    Returns:
        tuple: ``(net, optim, epoch)``; ``epoch`` is 0 when the directory is
        missing or contains no checkpoint.
    """
    if not os.path.isdir(ckpt_dir):
        epoch = 0
        return net, optim, epoch

    ckpt_lst = os.listdir(ckpt_dir)
    print(ckpt_lst)

    # Pair every usable file with its epoch; everything else is skipped.
    candidates = []
    for file_name in ckpt_lst:
        file_epoch = _checkpoint_epoch(file_name)
        if file_epoch is not None:
            candidates.append((file_epoch, file_name))
    if not candidates:
        # Directory exists but holds nothing to resume from.
        return net, optim, 0

    epoch, latest_name = max(candidates)
    dict_model = torch.load(os.path.join(ckpt_dir, latest_name))
    print(dict_model.keys())

    net.load_state_dict(dict_model['net'])
    optim.load_state_dict(dict_model['optim'])

    if early_stopping is not None and 'es_counter' in dict_model:
        early_stopping.counter = dict_model['es_counter']
        # Older checkpoints may carry only the counter; keep current values then.
        early_stopping.best_score = dict_model.get('es_best_score', early_stopping.best_score)
        early_stopping.val_loss_min = dict_model.get('es_val_loss_min', early_stopping.val_loss_min)

    return net, optim, epoch
You can save the counter value alongside the model state in the save function:
# Inside save(): store the early-stopping counter next to the model/optimizer state.
torch.save({'net': net.state_dict(), 'optim': optim.state_dict(), 'es_counter': early_stopping.counter},
           "%s/model_epoch%d.pth" % (ckpt_dir, epoch))
Here, `early_stopping` is an instance of the `EarlyStopping` class. Now you can load the counter value along with the model state in the load function:
es_counter = model_dict['es_counter']
#...
return net, optim, epoch, es_counter
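Now you can use the counter value to update the `early_stopping` object in your main function, e.g. `early_stopping.counter = es_counter`. Note that the counter alone is not the whole state: `best_score` and `val_loss_min` are also reset when the runtime restarts, so save and restore them the same way if improvements should be judged against the best loss seen before the restart.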