python deep-learning pytorch early-stopping

How to load early stopping counter in pytorch


import numpy as np
import torch

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience.

    Call the instance once per epoch with the validation loss and a dict of
    objects to checkpoint. ``early_stop`` becomes True once the loss has failed
    to improve for ``patience`` consecutive calls. Use ``state_dict()`` and
    ``load_state_dict()`` to carry the progress across interrupted runs.
    """
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Directory prefix for the checkpoints; every entry of
                            ``models_dict`` is written to ``<path>/<name>.pkl``.
                            This class does not create the directory.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.Inf was removed in NumPy 2.0; np.inf works on every NumPy version.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, models_dict):
        """Record one validation result and update the early-stopping state.

        Args:
            val_loss: Validation loss of the current epoch (lower is better).
            models_dict (dict): Maps a file name (without extension) to the
                object that should be saved when the loss improves.
        """
        # Work with a score where "higher is better".
        score = -val_loss

        if self.best_score is None:
            # First call: whatever we got is the best so far.
            self.best_score = score
            self.save_checkpoint(val_loss, models_dict)
        elif score < self.best_score + self.delta:
            # Not improved by at least `delta`: spend one unit of patience.
            self.counter += 1
            # This message is printed regardless of `verbose`.
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, models_dict)
            self.counter = 0

    def save_checkpoint(self, val_loss, models_dict):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        for file_name, obj in models_dict.items():
            torch.save(obj, self.path + "/" + file_name + ".pkl")
        self.val_loss_min = val_loss

    def state_dict(self):
        """Return the mutable progress of this object as a plain dict.

        Store the result next to the model checkpoint and pass it to
        ``load_state_dict`` after a restart to resume where training stopped.
        """
        return {
            'counter': self.counter,
            'best_score': self.best_score,
            'early_stop': self.early_stop,
            'val_loss_min': self.val_loss_min,
        }

    def load_state_dict(self, state):
        """Restore progress previously produced by ``state_dict``.

        Keys missing from ``state`` leave the corresponding attribute untouched.
        """
        for key in ('counter', 'best_score', 'early_stop', 'val_loss_min'):
            if key in state:
                setattr(self, key, state[key])

The code above is the EarlyStopping class I'm using. I'm trying to build an image segmentation model using UNet. The problem is that I cannot afford to keep my runtime alive all day. Using the code below, I was able to save the model itself every epoch, but whenever I reset my runtime, my early stopping counter gets re-initialized to 1. After one epoch a pkl file is created, but I wasn't able to find out how to resume from that point. Is there a way to resume from the counter value at which I stopped?

def save(ckpt_dir, net, optim, epoch):
    """Write the network and optimizer state for `epoch` into `ckpt_dir`.

    Args:
        ckpt_dir: Directory that receives the checkpoint; created if missing.
        net: Object whose ``state_dict()`` is stored under the key ``'net'``.
        optim: Object whose ``state_dict()`` is stored under the key ``'optim'``.
        epoch: Integer epoch number, embedded in the file name
            ``model_epoch<epoch>.pth``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(ckpt_dir, exist_ok=True)

    torch.save({'net': net.state_dict(), 'optim': optim.state_dict()},
               "%s/model_epoch%d.pth" % (ckpt_dir, epoch))


def load(ckpt_dir, net, optim):
    """Restore `net` and `optim` from the newest checkpoint in `ckpt_dir`.

    Checkpoint files are expected to be named like ``model_epoch<N>.pth``;
    the one with the largest ``N`` is loaded. Files whose name does not yield
    an epoch number are ignored.

    Args:
        ckpt_dir: Directory to search for checkpoints.
        net: Object updated in place via ``load_state_dict`` from key ``'net'``.
        optim: Object updated in place via ``load_state_dict`` from key ``'optim'``.

    Returns:
        tuple: ``(net, optim, epoch)``. ``epoch`` is 0 and nothing is loaded
        when the directory is missing or holds no usable checkpoint.
    """
    if not os.path.exists(ckpt_dir):
        epoch = 0
        return net, optim, epoch

    ckpt_lst = os.listdir(ckpt_dir)
    print(ckpt_lst)

    # Map epoch number -> file name, skipping anything that is not a
    # checkpoint (stray files would otherwise break sorting or parsing).
    by_epoch = {}
    for fname in ckpt_lst:
        try:
            by_epoch[int(fname.split('epoch')[1].split('.pth')[0])] = fname
        except (IndexError, ValueError):
            continue

    if not by_epoch:
        # Directory exists but has nothing to resume from.
        epoch = 0
        return net, optim, epoch

    epoch = max(by_epoch)
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    dict_model = torch.load('%s/%s' % (ckpt_dir, by_epoch[epoch]))
    print(dict_model.keys())

    net.load_state_dict(dict_model['net'])
    optim.load_state_dict(dict_model['optim'])

    return net, optim, epoch

Solution

  • You can save the counter value alongside the model state in the save function:

    torch.save({'net': net.state_dict(), 'optim': optim.state_dict(), 'es_counter': early_stopping.counter},
               "%s/model_epoch%d.pth" % (ckpt_dir, epoch))
    

    Here, early_stopping is an instance of the EarlyStopping class. Now you can load the counter value along with the model state in the load function:

    es_counter = dict_model['es_counter']
    #...
    
    return net, optim, epoch, es_counter
    

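    Now you can use the counter value to update the early_stopping object in your main function, e.g. early_stopping.counter = es_counter. Note that best_score and val_loss_min are also reset when the runtime restarts, so save and restore them the same way if the comparison baseline should survive the restart too.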