Tags: python, pytorch, hyperparameters

PyTorch hyperparameter tuning: model state seems to persist between training runs


During my hyperparameter search loop, the model seems to persist through the init phase, so each new run appears to resume from the last epoch instead of starting a fresh training cycle.

I'm working on a regression problem with a simple dense network, on CPU for now; I'll move to GPU later.

I'm running Python 3.10.4 and PyTorch 2.3.0+cu118.

import copy
import gc
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import OrderedDict
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import ParameterGrid

def set_seeds(seed):
    """
    Updated: 04/05
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # tf.config.experimental.enable_op_determinism()
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
    
SEED = 42  # hypothetical value; SEED was defined elsewhere in the original notebook
set_seeds(SEED)


class EarlyStopping: # Stops training when val_loss no longer improves
  def __init__(self, patience=25, min_delta=0):
    self.patience = patience
    self.min_delta = min_delta
    self.best_val_loss = float("inf")
    self.counter = 0

  def should_stop(self, val_loss):
    if val_loss < self.best_val_loss - self.min_delta:
        self.best_val_loss = val_loss
        self.counter = 0
    else:
        self.counter += 1
        if self.counter >= self.patience:
            return True
    return False
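
For illustration, a quick run of this class on a made-up val_loss sequence:

stopper = EarlyStopping(patience=2)
for v in [1.0, 0.8, 0.7, 0.75, 0.72]:  # improves three times, then stalls
    if stopper.should_stop(v):
        print("stop")                  # fires on the second non-improving value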


def weights_init(module):
    if module.__class__.__name__.find('Linear') != -1:
        # Uniform weights and zero bias. Note: uniform_() draws from the global
        # torch RNG, so the result depends on the current seed state.
        module.weight.data.uniform_(0.0, 1/np.sqrt(module.in_features))
        module.bias.data.fill_(0)

class Dense_Regressor_RELU(nn.Module):  # My simple regressor model
  def __init__(self,input_Features_nb,Model_params,Target_Number=3):
    super().__init__()
    self.params=Model_params
    #Build the stack of hidden dense blocks
    dense_layers=OrderedDict() #Ordered dictionary of (name, module) pairs
    for couche in range(1,Model_params['layer_number']):
      dense_layers["Dense"+str(couche)]=nn.Linear(Model_params['Neurons_number'], Model_params['Neurons_number']) #Input shape, Neurons Nb
      dense_layers["Norm"+str(couche)]=nn.BatchNorm1d(Model_params['Neurons_number'])
      dense_layers["Act"+str(couche)]=nn.ReLU()
      dense_layers["Drop"+str(couche)]=nn.Dropout(Model_params['Drop_rate'])

    self.input_layer=nn.Sequential(
                                    nn.Linear(input_Features_nb, Model_params['Neurons_number']) #Input shape, Neurons Nb
                                    ,nn.BatchNorm1d(Model_params['Neurons_number'])
                                    ,nn.ReLU()
                                    ,nn.Dropout(Model_params['Drop_rate'])
                                    )
    self.dense_layer=nn.Sequential(dense_layers)
    self.decision_layer=nn.Linear(Model_params['Neurons_number'], Target_Number)
    # self.cuda()

  def forward(self, x):
    x=self.input_layer(x)
    x=self.dense_layer(x)
    x=self.decision_layer(x)
    return x
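
As a quick sanity check of the class (the parameter values below are hypothetical):

_params = {'layer_number': 4, 'Neurons_number': 128, 'Drop_rate': 0.3}
_m = Dense_Regressor_RELU(input_Features_nb=10, Model_params=_params)
print(_m(torch.randn(8, 10)).shape)  # torch.Size([8, 3]) with the default Target_Number=3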


support="ELEC"
Test_Window=25
Model_Verbose=2
Loop_params={        'RNN_ACT':'',  
                    'RNN_neurons': 0,
                    'RNN_layers': 0,
                    'RNN_drop': 0,
                    "RNN_bidir":False,
                    'Time_Layering':0,#4,8,12
                    'Time_Layer_Order':"Act_to_Past",#"Past_to_Act"
                    "layer_number":4,
                    "Neurons_number":128,
                    "Drop_rate":0.3,
                    "Epochs_number":500,
                    "batch_size":128,
                    "Cutter__Compo":100,
                    'Min_Max_Loss_Weight':5,
                    'OPTI_LR':0.001
                    }
Params_New={
    'Model_type':"DL",
    'Model_output_type':'multiple',  # 'multiple': a single model predicts close, min & max
    'Model_pred_type':'evolution',   # or 'absolute'; 'evolution' predicts % evolution, 'full_evolution' drops all absolute values
    'Inputs':["Evolution_Past","Add_FinancialV3","Add_Period","Add_Croisement","Add_DELTA","Add_Correlation"],
    'Model_Params':{}
    }
Params_New['Model_Params'].update(**Loop_params)
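
The ParameterGrid import above suggests the surrounding search loop looks roughly like this (a sketch; the grid values are hypothetical):

grid = ParameterGrid({'Neurons_number': [64, 128], 'Drop_rate': [0.2, 0.3]})  # hypothetical values
for grid_params in grid:
    Model_params = {**Loop_params, **grid_params}  # override the defaults for this trial
    # ... build, train and evaluate the model as below ...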
#Oracle is my object handling data loading, cleaning, transforming, etc.
Oracle_Support=Oracle_custom(support,Custom_dict=Params_New,API_CALL=False,Test_Window=Test_Window)
Oracle_Support.Load_And_Clean_Data()
Oracle_Support.Pipe_Fit()
X_Train,Y_Train,X_Test,Y_Test=Oracle_Support.Return_Train_Test(To_Tensor=True,Transform=True)



#Model Init
Model_params=Loop_params
model=Dense_Regressor_RELU(input_Features_nb=X_Train.shape[1],Model_params=Model_params)
loss_fn = nn.MSELoss()  
optimizer = optim.Adam(model.parameters(), lr=Model_params['OPTI_LR'])
early_stopping = EarlyStopping(patience=25)


### Training Loop
best_val_loss = float("inf")
best_epoch=0
best_model_state_dict = None  # avoids a NameError later if no improvement is ever recorded
model.apply(weights_init)
# https://pytorch.org/torcheval/main/metric_example.html
for epoch in range(Model_params['Epochs_number']):
    model.train()
    for i in range(0, len(X_Train), Model_params['batch_size']):
      Xbatch = X_Train[i:i+Model_params['batch_size']]
      ybatch = Y_Train[i:i+Model_params['batch_size']]
      y_pred = model(Xbatch)
      # Weighted losses on min & max (columns 1, 2) plus close (column 0).
      # The original summed column 2 twice, which looks like a copy-paste slip.
      loss = Model_params['Min_Max_Loss_Weight']*loss_fn(y_pred[:,1], ybatch[:,1]) \
             + Model_params['Min_Max_Loss_Weight']*loss_fn(y_pred[:,2], ybatch[:,2]) \
             + loss_fn(y_pred[:,0], ybatch[:,0])
      optimizer.zero_grad(set_to_none=True)
      loss.backward()
      optimizer.step()
      
      
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
      y_pred = model(X_Test)
      val_loss = loss_fn(y_pred[:,1], Y_Test[:,1])+loss_fn(y_pred[:,2], Y_Test[:,2])+loss_fn(y_pred[:,0], Y_Test[:,0])  # column 0, not 2 twice

    val_loss /= X_Test.shape[0]  # note: MSELoss already averages, so this only rescales val_loss
    if Model_Verbose>1:
      print(f"Époque {epoch}, latest loss {loss}, val_loss: {val_loss:.4f}")

    # Best model saving
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_epoch = epoch  # track the epoch of the best model
        # deepcopy: state_dict() returns live references that later steps would overwrite
        best_model_state_dict = copy.deepcopy(model.state_dict())
        if Model_Verbose>1:
          print("New best model found")

    # Early stopping
    if early_stopping.should_stop(val_loss):
        if Model_Verbose>0:
          print(f'{support} Early stop, best epoch: {best_epoch} // with val loss: {best_val_loss}')
        break

# Restore the best model
if best_model_state_dict is not None:
    model.load_state_dict(best_model_state_dict)
    # self.Best_Epoch=best_epoch
    if Model_Verbose>1:
      print("Best model loaded!")
else:
    print("No model to restore.")

# Variable deletion
del best_model_state_dict,best_epoch
del X_Train,X_Test,Y_Train,Y_Test
del Xbatch, ybatch,loss_fn,optimizer,early_stopping,loss
del model
gc.collect()
#### End of model training

When I run this code, I get these results:

First run =>
NVIDIA Early stop, best epoch: 25 // with val loss: 0.0004996684729121625

Second run =>
NVIDIA Early stop, best epoch: 24 // with val loss: 0.0005961030256003141

Third run =>
NVIDIA Early stop, best epoch: 61 // with val loss: 0.0003799324913416058

If I restart the kernel:

First run =>
NVIDIA Early stop, best epoch: 25 // with val loss: 0.0004996684729121625

Second run =>
NVIDIA Early stop, best epoch: 24 // with val loss: 0.0005961030256003141

Third run =>
NVIDIA Early stop, best epoch: 61 // with val loss: 0.0003799324913416058

I don't understand how the model keeps training from where it left off instead of restarting. Only restarting the kernel gets me back to the first-run results.

Please help ! :)

I've tried moving from GPU to CPU and destroying my objects; only restarting the kernel between runs works...

How can I make sure the model and all hidden state are fully reset between iterations, so that the hyperparameter tuning loop is safe?
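
A typical per-iteration reset combines explicit deletes, garbage collection, CUDA cache release and re-seeding; a minimal sketch, reusing the names from the code above:

del model, optimizer, loss_fn, early_stopping   # drop references from the previous trial
gc.collect()                                    # reclaim Python-side memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()                    # release cached GPU memory
set_seeds(SEED)                                 # reset the RNG state before the next init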


Solution

  • I've solved my problem!

    Here is the point: the random seeds need to be set inside the hyperparameter loop.

    The model never actually resumes training. Each model construction and
    weights_init call consumes numbers from the global RNG, which keeps advancing
    across runs, so every relaunch starts from different initial weights. Re-seeding
    at the top of each iteration resets the RNG, and every run then starts from an
    identical state.

    Meaning =>

    # Hyperparameter dicts are set up above, then:
    for params in hyper_params:
        set_seeds(SEED)  # <-- re-seed at the top of EVERY iteration
        # Model init
        Model_params=Loop_params
        model=Dense_Regressor_RELU(input_Features_nb=X_Train.shape[1],Model_params=Model_params)
        loss_fn = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=Model_params['OPTI_LR'])
        early_stopping = EarlyStopping(patience=25)

        ### Training loop
        best_val_loss = float("inf")
        best_epoch=0
        model.apply(weights_init)
        # https://pytorch.org/torcheval/main/metric_example.html
        for epoch in range(Model_params['Epochs_number']):
            model.train()
            for i in range(0, len(X_Train), Model_params['batch_size']):
                Xbatch = X_Train[i:i+Model_params['batch_size']]
                ybatch = Y_Train[i:i+Model_params['batch_size']]
                y_pred = model(Xbatch)
                loss = Model_params['Min_Max_Loss_Weight']*loss_fn(y_pred[:,1], ybatch[:,1]) \
                       + Model_params['Min_Max_Loss_Weight']*loss_fn(y_pred[:,2], ybatch[:,2]) \
                       + loss_fn(y_pred[:,0], ybatch[:,0])  # column 0 here too
                optimizer.zero_grad(set_to_none=True)
                loss.backward()
                optimizer.step()
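
    To see why this works, here is a minimal, self-contained repro (independent of the code above): module construction draws its initial weights from the global torch RNG, so without re-seeding each construction yields different weights.

    import torch
    import torch.nn as nn

    torch.manual_seed(0)
    a = nn.Linear(4, 4).weight.sum().item()
    b = nn.Linear(4, 4).weight.sum().item()  # RNG has advanced: different weights
    print(a == b)                            # False

    torch.manual_seed(0)
    c = nn.Linear(4, 4).weight.sum().item()
    torch.manual_seed(0)
    d = nn.Linear(4, 4).weight.sum().item()  # RNG reset first: identical weights
    print(c == d)                            # True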